diff --git a/CMakeLists.txt b/CMakeLists.txt index d4a0eb067b4f1..334a6cfcd0ee1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,10 +300,6 @@ if(WITH_GPU) endif() endif() -if(WITH_CINN) - include(cinn) -endif() - if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure diff --git a/cmake/cinn.cmake b/cmake/external/cinn.cmake similarity index 64% rename from cmake/cinn.cmake rename to cmake/external/cinn.cmake index dd5f809e9581a..ee5aea9f8b294 100644 --- a/cmake/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -27,16 +27,15 @@ add_definitions(-w) include(ExternalProject) set(CINN_SOURCE_DIR ${THIRD_PARTY_PATH}/CINN) # TODO(zhhsplendid): Modify git tag after we have release tag -set(CINN_GIT_TAG 3f004bfa3ed273ecf1de8e7b946433038c79b84f) -set(CINN_OPTIONAL_ARGS -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON) -set(CINN_BUILD_COMMAND $(MAKE) cinncore -j && $(MAKE) cinnapi -j) +set(CINN_GIT_TAG e422c01b7875301996a2baf67a14ba61b0e6192a) +set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON -DWITH_TESTING=ON) +set(CINN_BUILD_COMMAND $(MAKE) cinnapi -j) ExternalProject_Add( external_cinn ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" GIT_TAG ${CINN_GIT_TAG} PREFIX ${CINN_SOURCE_DIR} - UPDATE_COMMAND "" BUILD_COMMAND ${CINN_BUILD_COMMAND} INSTALL_COMMAND "" CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) @@ -52,49 +51,20 @@ message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") -######################### -# Add CINN's dependencies -######################### +###################################### +# Add CINN's dependencies header files +###################################### # Add absl -set(ABSL_LIB_NAMES - hash - wyhash - city - strings - throw_delegate - bad_any_cast_impl - bad_optional_access - bad_variant_access - raw_hash_set - ) -set(ABSL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/lib") set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include") -add_library(absl STATIC IMPORTED GLOBAL) -set_target_properties(absl PROPERTIES IMPORTED_LOCATION ${ABSL_LIB_DIR}/libabsl_base.a) -foreach(lib_name ${ABSL_LIB_NAMES}) - target_link_libraries(absl INTERFACE ${ABSL_LIB_DIR}/libabsl_${lib_name}.a) -endforeach() include_directories(${ABSL_INCLUDE_DIR}) # Add isl -set(ISL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/lib") set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include") -add_library(isl STATIC IMPORTED GLOBAL) -set_target_properties(isl PROPERTIES IMPORTED_LOCATION ${ISL_LIB_DIR}/libisl.a) include_directories(${ISL_INCLUDE_DIR}) # Add LLVM -set(LLVM_LIB_NAMES - ExecutionEngine - ) -set(LLVM_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/lib") set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include") -add_library(llvm STATIC IMPORTED GLOBAL) -set_target_properties(llvm PROPERTIES IMPORTED_LOCATION ${LLVM_LIB_DIR}/libLLVMCore.a) -foreach(lib_name ${LLVM_LIB_NAMES}) - target_link_libraries(llvm INTERFACE ${LLVM_LIB_DIR}/libLLVM${lib_name}.a) -endforeach() include_directories(${LLVM_INCLUDE_DIR}) ###################################################### @@ -108,5 +78,5 @@ set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") add_library(cinn SHARED IMPORTED GLOBAL) set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") include_directories(${CINN_INCLUDE_DIR}) -add_dependencies(cinn external_cinn absl isl llvm 
glog gflag) +add_dependencies(cinn external_cinn) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 70bdc67980c03..11a7adbbeb9a8 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210921") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index a396af570f324..7830cf7b50acc 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -217,7 +217,8 @@ function(op_library TARGET) "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op" -"fused_bn_add_activation_op" "fused_attention_op" "resnet_unit_op") +"fused_bn_add_activation_op" "fused_attention_op" "resnet_unit_op" "fused_feedforward_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -298,7 +299,7 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) + if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index d45b5e07bb8f3..7cdbee1746a8f 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -360,6 +360,12 @@ if (WITH_LITE) include(external/lite) endif (WITH_LITE) +if (WITH_CINN) + message(STATUS "Compile Paddle with CINN.") + include(external/cinn) + add_definitions(-DPADDLE_WITH_CINN) +endif (WITH_CINN) + if (WITH_CRYPTO) include(external/cryptopp) # download, build, install cryptopp list(APPEND third_party_deps extern_cryptopp) diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 002be15b003eb..4483f960eb137 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -119,13 +119,11 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; - // optional SparseSGDRuleParameter sparse_sgd_param = 2; optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_threshold = 6 [ default = 10 ]; optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; - // optional SparseCommonSGDRuleParameter sparse_commonsgd_param = 9; optional SparseCommonSGDRuleParameter embed_sgd_param = 10; optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; } @@ -182,13 +180,6 @@ message TableAccessorSaveParameter { optional string deconverter = 3; } -// message SparseSGDRuleParameter { -// optional double learning_rate = 1 [default = 0.05]; -// optional double initial_g2sum = 2 [default = 3.0]; -// optional double initial_range = 3 [default = 0.0001]; -// repeated float weight_bounds = 4; -//} - message 
SparseCommonSGDRuleParameter { optional string name = 1; optional SparseNaiveSGDRuleParameter naive = 2; diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index b4b87e652b7db..7ec7041b63ba1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -36,7 +36,8 @@ cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto exec set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) +cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) - -cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost sparse_sgd_rule) +cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor) diff --git a/paddle/fluid/distributed/table/ctr_accessor.cc b/paddle/fluid/distributed/table/ctr_accessor.cc new file mode 100644 index 0000000000000..1ef8c9e152733 --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +int CtrCommonAccessor::initialize() { + auto name = _config.embed_sgd_param().name(); + _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); + + name = _config.embedx_sgd_param().name(); + _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embedx_sgd_rule->load_config(_config.embedx_sgd_param(), + _config.embedx_dim()); + + common_feature_value.embed_sgd_dim = _embed_sgd_rule->dim(); + common_feature_value.embedx_dim = _config.embedx_dim(); + common_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim(); + _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate(); + + return 0; +} + +size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } + +size_t CtrCommonAccessor::dim_size(size_t dim) { + auto embedx_dim = _config.embedx_dim(); + return common_feature_value.dim_size(dim, embedx_dim); +} + +size_t CtrCommonAccessor::size() { return common_feature_value.size(); } + +size_t CtrCommonAccessor::mf_size() { + return (_config.embedx_dim() + common_feature_value.embedx_sgd_dim) * + sizeof(float); // embedx embedx_g2sum +} + +// pull value +size_t CtrCommonAccessor::select_dim() { + auto embedx_dim = _config.embedx_dim(); + return 1 + embedx_dim; +} + +size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::select_size() { return select_dim() * sizeof(float); } + +// push value +size_t CtrCommonAccessor::update_dim() { + auto embedx_dim = _config.embedx_dim(); + return 4 + embedx_dim; +} + +size_t CtrCommonAccessor::update_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::update_size() { return update_dim() * sizeof(float); } + +bool CtrCommonAccessor::shrink(float* value) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delete_after_unseen_days = + _config.ctr_accessor_param().delete_after_unseen_days(); + auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); + + // time_decay first + common_feature_value.show(value) *= _show_click_decay_rate; + common_feature_value.click(value) *= _show_click_decay_rate; + + // shrink after + auto score = show_click_score(common_feature_value.show(value), + common_feature_value.click(value)); + auto unseen_days = common_feature_value.unseen_days(value); + if (score < delete_threshold || unseen_days > delete_after_unseen_days) { + return true; + } + return false; +} + +bool CtrCommonAccessor::save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + // save all + case 0: { + return true; + } + // save xbox delta + case 1: + // save xbox base + case 2: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + // do this after save, because it must not be modified when retry + if (param == 2) { + 
common_feature_value.delta_score(value) = 0; + } + return true; + } else { + return false; + } + } + // already decayed in shrink + case 3: { + // do this after save, because it must not be modified when retry + // common_feature_value.unseen_days(value)++; + return true; + } + // save revert batch_model + case 5: { + return true; + } + default: + return true; + } +} + +void CtrCommonAccessor::update_stat_after_save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + case 1: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + common_feature_value.delta_score(value) = 0; + } + } + return; + case 3: { + common_feature_value.unseen_days(value)++; + } + return; + default: + return; + } +} + +int32_t CtrCommonAccessor::create(float** values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* value = values[value_item]; + value[common_feature_value.unseen_days_index()] = 0; + value[common_feature_value.delta_score_index()] = 0; + value[common_feature_value.show_index()] = 0; + value[common_feature_value.click_index()] = 0; + value[common_feature_value.slot_index()] = -1; + _embed_sgd_rule->init_value( + value + common_feature_value.embed_w_index(), + value + common_feature_value.embed_g2sum_index()); + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index(), false); + } + return 0; +} + +bool CtrCommonAccessor::need_extend_mf(float* value) { + float show = value[common_feature_value.show_index()]; + float click = value[common_feature_value.click_index()]; + float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + + click * _config.ctr_accessor_param().click_coeff(); + return score >= _config.embedx_threshold(); +} + +bool CtrCommonAccessor::has_mf(size_t size) { + return size > common_feature_value.embedx_g2sum_index(); +} + +// from CommonFeatureValue to CtrCommonPullValue +int32_t CtrCommonAccessor::select(float** select_values, const float** values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* select_value = select_values[value_item]; + const float* value = values[value_item]; + select_value[CtrCommonPullValue::embed_w_index()] = + value[common_feature_value.embed_w_index()]; + memcpy(select_value + CtrCommonPullValue::embedx_w_index(), + value + common_feature_value.embedx_w_index(), + embedx_dim * sizeof(float)); + } + return 0; +} + +// from CtrCommonPushValue to CtrCommonPushValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::merge(float** update_values, + const float** other_update_values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + size_t total_dim = CtrCommonPushValue::dim(embedx_dim); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* other_update_value = other_update_values[value_item]; + for (auto i = 0u; i < total_dim; ++i) { + if (i != 
CtrCommonPushValue::slot_index()) { + update_value[i] += other_update_value[i]; + } + } + } + return 0; +} + +// from CtrCommonPushValue to CommonFeatureValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::update(float** update_values, + const float** push_values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* push_value = push_values[value_item]; + float push_show = push_value[CtrCommonPushValue::show_index()]; + float push_click = push_value[CtrCommonPushValue::click_index()]; + float slot = push_value[CtrCommonPushValue::slot_index()]; + update_value[common_feature_value.show_index()] += push_show; + update_value[common_feature_value.click_index()] += push_click; + update_value[common_feature_value.slot_index()] = slot; + update_value[common_feature_value.delta_score_index()] += + (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + + push_click * _config.ctr_accessor_param().click_coeff(); + update_value[common_feature_value.unseen_days_index()] = 0; + _embed_sgd_rule->update_value( + update_value + common_feature_value.embed_w_index(), + update_value + common_feature_value.embed_g2sum_index(), + push_value + CtrCommonPushValue::embed_g_index()); + _embedx_sgd_rule->update_value( + update_value + common_feature_value.embedx_w_index(), + update_value + common_feature_value.embedx_g2sum_index(), + push_value + CtrCommonPushValue::embedx_g_index()); + } + return 0; +} + +bool CtrCommonAccessor::create_value(int stage, const float* value) { + // stage == 0, pull + // stage == 1, push + if (stage == 0) { + return true; + } else if (stage == 1) { + // operation + auto show = CtrCommonPushValue::show_const(value); + auto click = CtrCommonPushValue::click_const(value); + auto score = show_click_score(show, click); + if (score <= 0) { + return false; + } + if (score >= 1) { + return true; + } + return local_uniform_real_distribution<float>()(local_random_engine()) < + score; + } else { + return true; + } +} + +float CtrCommonAccessor::show_click_score(float show, float click) { + auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); + auto click_coeff = _config.ctr_accessor_param().click_coeff(); + return (show - click) * nonclk_coeff + click * click_coeff; +} + +std::string CtrCommonAccessor::parse_to_string(const float* v, int param) { + thread_local std::ostringstream os; + os.clear(); + os.str(""); + os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " + << v[5]; + for (int i = common_feature_value.embed_g2sum_index(); + i < common_feature_value.embedx_w_index(); i++) { + os << " " << v[i]; + } + auto show = common_feature_value.show_const(v); + auto click = common_feature_value.click_const(v); + auto score = show_click_score(show, click); + if (score >= _config.embedx_threshold()) { + for (auto i = common_feature_value.embedx_w_index(); + i < common_feature_value.dim(); ++i) { + os << " " << v[i]; + } + } + return os.str(); +} + +int CtrCommonAccessor::parse_from_string(const std::string& str, float* value) { + int embedx_dim = _config.embedx_dim(); + + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index()); + auto ret = paddle::string::str_to_float(str.data(), value); + CHECK(ret >= 6) << "expect more than 6 real:" << ret; + return ret; +} + +} // namespace distributed +} // namespace paddle diff --git
a/paddle/fluid/distributed/table/ctr_accessor.h b/paddle/fluid/distributed/table/ctr_accessor.h new file mode 100644 index 0000000000000..3c2ac7189f777 --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { + +class CtrCommonAccessor : public ValueAccessor { + public: + struct CtrCommonFeatureValue { + /* + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector<float> embed_g2sum; + std::vector<float> embedx_w; + std::vector<float> embedx_g2sum; + */ + + int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } + int size() { return dim() * sizeof(float); } + int slot_index() { return 0; } + int unseen_days_index() { return slot_index() + 1; } + int delta_score_index() { return unseen_days_index() + 1; } + int show_index() { return delta_score_index() + 1; } + int click_index() { return show_index() + 1; } + int embed_w_index() { return click_index() + 1; } + int embed_g2sum_index() { return embed_w_index() + 1; } + int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; } + int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; } + + float& unseen_days(float* val) { return val[unseen_days_index()]; } + float& delta_score(float* val) { return val[delta_score_index()]; } + float& show(float* val) { return val[show_index()]; } + float& click(float* val) { return val[click_index()]; } + float& slot(float* val) { return val[slot_index()]; } + float& embed_w(float* val) { return val[embed_w_index()]; } + float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; } + float& embedx_w(float* val) { return val[embedx_w_index()]; } + float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; } + float show_const(const float* val) { + float s = val[show_index()]; + return s; + } + float click_const(const float* val) { + float c = val[click_index()]; + return c; + } + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + }; + + struct CtrCommonPushValue { + /* + float slot; + float show; + float click; + float embed_g; + std::vector<float> embedx_g; + */ + + static int dim(int embedx_dim) { return 4 + embedx_dim; } + + static int dim_size(int dim, int embedx_dim) { return sizeof(float); } + static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } + static int slot_index() { return 0; } + static int show_index() { return CtrCommonPushValue::slot_index() + 1; } + static int click_index() { return CtrCommonPushValue::show_index() + 1; } + static int embed_g_index() { return
CtrCommonPushValue::click_index() + 1; } + static int embedx_g_index() { + return CtrCommonPushValue::embed_g_index() + 1; + } + static float& slot(float* val) { + return val[CtrCommonPushValue::slot_index()]; + } + static float& show(float* val) { + return val[CtrCommonPushValue::show_index()]; + } + static float& click(float* val) { + return val[CtrCommonPushValue::click_index()]; + } + static float show_const(const float* val) { + float s = val[show_index()]; + return s; + } + static float click_const(const float* val) { + float c = val[click_index()]; + return c; + } + static float& embed_g(float* val) { + return val[CtrCommonPushValue::embed_g_index()]; + } + static float* embedx_g(float* val) { + return val + CtrCommonPushValue::embedx_g_index(); + } + }; + + struct CtrCommonPullValue { + /* + float embed_w; + std::vector<float> embedx_w; + */ + + static int dim(int embedx_dim) { return 1 + embedx_dim; } + static int dim_size(size_t dim) { return sizeof(float); } + static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } + static int embed_w_index() { return 0; } + static int embedx_w_index() { return 1; } + static float& embed_w(float* val) { + return val[CtrCommonPullValue::embed_w_index()]; + } + static float* embedx_w(float* val) { + return val + CtrCommonPullValue::embedx_w_index(); + } + }; + CtrCommonAccessor() {} + virtual int initialize(); + virtual ~CtrCommonAccessor() {} + + // dimension of value + virtual size_t dim(); + // size of each dimension of value + virtual size_t dim_size(size_t dim); + // total size summed over all dimensions of value + virtual size_t size(); + // total size of the dynamic-length mf part of value; effective for sparse + virtual size_t mf_size(); + // dimension of pull value + virtual size_t select_dim(); + // size of each dimension of pull value + virtual size_t select_dim_size(size_t dim); + // total size summed over all dimensions of pull value + virtual size_t select_size(); + // dimension of push value + virtual size_t update_dim(); + // size of each dimension of push value + virtual size_t update_dim_size(size_t dim); + // total size summed over all dimensions of push value + virtual size_t update_size(); + // whether this value should be shrunk + virtual bool shrink(float* value); + // whether this value should be saved to ssd + // virtual bool save_ssd(float* value); + virtual bool need_extend_mf(float* value); + virtual bool has_mf(size_t size); + // whether this value should be dumped in the save stage; + // param identifies the save stage, e.g. downpour's xbox vs. batch_model + // param = 0, save all feature + // param = 1, save delta feature + // param = 2, save xbox base feature + bool save(float* value, int param) override; + // update delta_score and unseen_days after save + void update_stat_after_save(float* value, int param) override; + // when keys do not exist, generate random values for values; + // the memory of value must already be allocated by the caller + virtual int32_t create(float** value, size_t num); + // select from values into select_values + virtual int32_t select(float** select_values, const float** values, + size_t num); + // aggregate update_values together + virtual int32_t merge(float** update_values, + const float** other_update_values, size_t num); + // aggregate update_values together; use it.next to decide whether to + // move on to the next key + // virtual int32_t merge(float** update_values, iterator it); + // apply the updates in update_values to values + virtual int32_t update(float** values, const float** update_values, + size_t num); + + std::string parse_to_string(const float* value, int param) override; + int32_t parse_from_string(const std::string& str, float* v) override; + virtual bool create_value(int type, const float* value); + + // currently this interface is only used to fetch show + float get_field(float* value, const std::string& name) override { + // CHECK(name == "show"); + if (name == "show") { + return common_feature_value.show(value); + } + return 0.0; + } + + private: + // float
show_click_score(float show, float click); + + // SparseValueSGDRule* _embed_sgd_rule; + // SparseValueSGDRule* _embedx_sgd_rule; + // CtrCommonFeatureValue common_feature_value; + float _show_click_decay_rate; + int32_t _ssd_unseenday_threshold; + + public: // TODO(zhaocaibei123): it should be private, but we make it public + // for unit test + CtrCommonFeatureValue common_feature_value; + float show_click_score(float show, float click); + SparseValueSGDRule* _embed_sgd_rule; + SparseValueSGDRule* _embedx_sgd_rule; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 832797ec2fc0e..f8cd9af4774ec 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -26,3 +26,6 @@ cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc new file mode 100644 index 0000000000000..8c667cad605fc --- /dev/null +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -0,0 +1,304 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); + +TableAccessorParameter gen_param() { + TableAccessorParameter param; + param.set_accessor_class("CtrCommonAccessor"); + param.set_fea_dim(11); + param.set_embedx_dim(8); + param.mutable_ctr_accessor_param()->set_nonclk_coeff(0.2); + param.mutable_ctr_accessor_param()->set_click_coeff(1); + param.mutable_ctr_accessor_param()->set_base_threshold(0.5); + param.mutable_ctr_accessor_param()->set_delta_threshold(0.2); + param.mutable_ctr_accessor_param()->set_delta_keep_days(16); + param.mutable_ctr_accessor_param()->set_show_click_decay_rate(0.99); + /* + param.mutable_embed_sgd_param()->set_name("naive"); + auto* naive_param = param.mutable_embed_sgd_param()->mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + */ + param.mutable_embed_sgd_param()->set_name("StdAdaGradSGDRule"); + auto* adagrad_param = param.mutable_embed_sgd_param()->mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_range(0.3); + adagrad_param->set_initial_g2sum(0.0); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + param.mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule"); + auto* naive_param = param.mutable_embedx_sgd_param()->mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + return std::move(param); +} + +TEST(downpour_feature_value_accessor_test, test_shrink) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + VLOG(3) << "size of struct: " << acc->common_feature_value.embed_sgd_dim + << " " << acc->common_feature_value.embedx_dim << " " + << acc->common_feature_value.embedx_sgd_dim << " " + << acc->common_feature_value.dim() << "\n"; + + float* value = new float[acc->dim()]; + for (auto i = 0u; i < acc->dim(); ++i) { + value[i] = i * 1.0; + } + ASSERT_TRUE(!acc->shrink(value)); + + // set unseen_days too long + value[1] = 1000; + // set delta score too small + value[2] = 0.001; + ASSERT_TRUE(acc->shrink(value)); +} + +TEST(downpour_feature_value_accessor_test, test_save) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + float* value = new float[acc->dim()]; + for (auto i = 0u; i < acc->dim(); ++i) { + value[i] = i * 1.0; + } + + // save all feature + ASSERT_TRUE(acc->save(value, 0)); + + // save delta feature + ASSERT_TRUE(acc->save(value, 1)); + + // save base feature with time decay + ASSERT_TRUE(acc->save(value, 2)); + + VLOG(3) << "test_save:"; + for (auto i = 0u; i < acc->dim(); ++i) { + VLOG(3) << value[i]; + } +} + 
+TEST(downpour_feature_value_accessor_test, test_create) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + } + ASSERT_EQ(acc->create(value, item_size), 0); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < field_size; ++j) { + VLOG(3) << value[i][j] << " "; + // ASSERT_FLOAT_EQ(value[i][j], 0); + } + VLOG(3) << "\n"; + } +} + +TEST(downpour_feature_value_accessor_test, test_update) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + VLOG(3) << "dim: " << acc->common_feature_value.dim() << "\n"; + VLOG(3) << "update_dim: " << acc->update_dim() << "\n"; + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + + for (auto j = 0u; j < field_size; ++j) { + value[i][j] = 0; + } + } + + typedef const float* const_float_ptr; + const_float_ptr* grad = new const_float_ptr[item_size]; + for (auto i = 0u; i < item_size; ++i) { + float* p = new float[acc->update_dim()]; + for (auto j = 0u; j < acc->update_dim(); ++j) { + p[j] = i; + } + grad[i] = p; + } + + struct DownpourSparseValueTest { + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector<float> embed_g2sum; + std::vector<float> embedx_w; + std::vector<float> embedx_g2sum; + + void to_array(float* ptr, size_t dim) { + ptr[0] = slot; + ptr[1] = unseen_days; + ptr[2] = delta_score; + ptr[3] = show; + ptr[4] = click; + ptr[5] = embed_w; + int idx = 6; + for (auto j = 0u; j < 1; ++j) { + ptr[idx + j] = embed_g2sum[j]; + } + idx += 1; + for (auto j = 0u; j < 8; ++j) { + ptr[idx + j] = embedx_w[j]; + } + idx += 8; + for (auto j = 0u; j < 0; ++j) { + ptr[idx + j] = embedx_g2sum[j]; + } + } + }; + struct DownpourSparsePushValueTest { + float slot; + float show; + float click; + float embed_g; + std::vector<float> embedx_g; + }; + std::vector<float*> exp_value; + for (auto i = 0u; i < item_size; ++i) { + DownpourSparseValueTest v; + v.slot = value[i][0]; + v.unseen_days = value[i][1]; + v.delta_score = value[i][2]; + v.show = value[i][3]; + v.click = value[i][4]; + v.embed_w = value[i][5]; + + int idx = 6; + for (auto j = 0u; j < acc->common_feature_value.embed_sgd_dim; ++j) { + v.embed_g2sum.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embed_sgd_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_dim; ++j) { + v.embedx_w.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embedx_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_sgd_dim; ++j) { + v.embedx_g2sum.push_back(value[i][idx + j]); + } + + DownpourSparsePushValueTest push_v; + push_v.slot = grad[i][0]; + push_v.show = grad[i][1]; + push_v.click = grad[i][2]; + push_v.embed_g = grad[i][3]; + for (auto j = 0; j < parameter.embedx_dim(); ++j) { + push_v.embedx_g.push_back(grad[i][4 + j]); + } + + v.slot = push_v.slot; + v.unseen_days = 0; + v.show += push_v.show; + v.click += push_v.click; + v.delta_score += acc->show_click_score(push_v.show, push_v.click); + + acc->_embed_sgd_rule->update_value(&v.embed_w, &v.embed_g2sum[0], +
&push_v.embed_g); + acc->_embedx_sgd_rule->update_value(&v.embedx_w[0], &v.embedx_g2sum[0], + &push_v.embedx_g[0]); + + float* ptr = new float[acc->dim()]; + v.to_array(ptr, parameter.embedx_dim()); + exp_value.push_back(ptr); + } + acc->update(value, grad, item_size); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < acc->dim(); ++j) { + VLOG(3) << value[i][j] << ":" << exp_value[i][j] << " "; + ASSERT_FLOAT_EQ(value[i][j], exp_value[i][j]); + } + } +} + +TEST(downpour_feature_value_accessor_test, test_show_click_score) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + float show = 10; + float click = 6; + ASSERT_FLOAT_EQ(acc->show_click_score(show, click), 6.8); +} + +TEST(downpour_feature_value_accessor_test, test_string_related) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 15; + float* value = new float[field_size]; + for (auto i = 0u; i < field_size; ++i) { + value[i] = i; + } + + auto str = acc->parse_to_string(value, 0); + + VLOG(3) << str << std::endl; + + str = "0 1 2 3 4 5 6"; + ASSERT_NE(acc->parse_from_string(str, value), 0); + // make sure init_zero=true + + for (auto i = 7; i < 15; ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4dfcf0985b85e..edb43b8d38c27 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -26,7 +26,9 @@ add_subdirectory(details) add_subdirectory(fleet) add_subdirectory(io) add_subdirectory(new_executor) -add_subdirectory(paddle2cinn) +if (WITH_CINN) + add_subdirectory(paddle2cinn) +endif() #ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) @@ -353,7 +355,7 @@ target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_h cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy bind_threaded_ssa_graph_executor collective_helper - fast_threaded_ssa_graph_executor variable_helper cinn_runner) + fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 5e2fd08406fa7..87f77ec2fff3a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,12 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass build_cinn_pass) + fix_op_run_order_pass) + +if (WITH_CINN) + set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) +endif() + if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc 
b/paddle/fluid/framework/details/build_strategy.cc index 6b6ee40833123..1bb1ae0ea6755 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -20,8 +20,10 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" DECLARE_bool(convert_all_blocks); -DECLARE_bool(use_cinn); DECLARE_bool(use_mkldnn); +#ifdef PADDLE_WITH_CINN +DECLARE_bool(use_cinn); +#endif namespace paddle { namespace framework { @@ -72,10 +74,13 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. AppendPass("multi_devices_check_pass"); - // Note: This pass is used to enable cinn. +#ifdef PADDLE_WITH_CINN if (FLAGS_use_cinn) { + // Note: This pass is used to enable cinn. AppendPass("build_cinn_pass"); } +#endif + SetCollectiveContext(); } @@ -486,7 +491,9 @@ USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(add_reader_dependency_pass); +#ifdef PADDLE_WITH_CINN USE_PASS(build_cinn_pass); +#endif #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index f1ec042dbd705..42ae73f9b13f1 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include +#include <gloo/allgatherv.h> #include #include #include @@ -238,10 +239,25 @@ class GlooWrapper { return ret; } - // TODO(xiongkun03): support all gather array of + // NOTE(@xiongkun03): support all gather array of // numbers with different length - // can use AllgathervOptions, may be work in different - // occasion. Need some survey. + // if the third argument is int, use allgather, + // if it is vector, use AllgathervOptions, + // which works in different length occasion. + template <typename T> + void AllGatherVector(T* input_ptr, T* output_ptr, + std::vector<size_t>& element_nums) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgathervOptions opts(context_); + opts.setInput(input_ptr, element_nums[rank_]); + opts.setOutput(output_ptr, element_nums); + gloo::allgatherv(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + + template <typename T> void AllGatherVector(T* input_ptr, T* output_ptr, size_t element_num) { // NOLINT diff --git a/paddle/fluid/framework/ir/cinn_lib_test.cc b/paddle/fluid/framework/ir/cinn_lib_test.cc index cdee45a06c71a..23cb653fef22a 100644 --- a/paddle/fluid/framework/ir/cinn_lib_test.cc +++ b/paddle/fluid/framework/ir/cinn_lib_test.cc @@ -24,6 +24,7 @@ limitations under the License.
*/ #include #endif +#include "cinn/cinn.h" #include "cinn/common/target.h" #include "cinn/frontend/net_builder.h" #include "cinn/frontend/syntax.h" diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index b9cc337df8792..2fc133edb7a96 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -181,7 +181,7 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { "Weight scale should be nonzero, but get zero.")); weight_scale[i] = weight_scale[i] / range; } - } else { + } else if (dequant_type == "fake_quantize_dequantize_abs_max") { // Implement quantize_dequantize_abs_max quantization algorithm float abs_max_weight = 0.; for (int j = 0; j < weight_tensor->numel(); j++) { @@ -192,6 +192,9 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Weight scale should be nonzero, but get zero")); weight_scale.push_back(abs_max_weight / range); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported quantize_dequantize op type: %s", dequant_type)); } nodes2rm.insert(quant_dequant_op_outscale); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d1f17625adb3d..dd0ffe8b9fd0d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1627,6 +1627,7 @@ PDNode *patterns::Matmul::operator()() { ->assert_is_op_input("matmul", "X"); auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) ->AsInput() + ->assert_is_persistable_var() ->assert_is_op_input("matmul", "Y"); auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsOutput() @@ -1636,23 +1637,45 @@ PDNode *patterns::Matmul::operator()() { return matmul_out; } +// MatmulV2: tensor * weight +PDNode *patterns::MatmulV2Weight::operator()() { + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); + + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_persistable_var() // Y is weight + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out = pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; +} + +// MatmulV2: tensor * tensor or tensor * weight PDNode *patterns::MatmulV2::operator()() { - auto matmul_op = - pattern->NewNode(matmul_op_repr())->assert_is_op("matmul_v2"); + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); - auto matmul_in_x = pattern->NewNode(matmul_in_x_repr()) - ->AsInput() - ->assert_is_op_input("matmul_v2", "X"); - auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) - ->assert_is_persistable_var() - ->AsInput() - ->assert_is_op_input("matmul_v2", "Y"); - auto matmul_out = pattern->NewNode(matmul_out_repr()) - ->AsOutput() - ->assert_is_op_output("matmul_v2", "Out"); + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out 
= pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); - matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out}); - return matmul_out; + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; } PDNode *patterns::Squeeze2Matmul::operator()() { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index a716d048aca01..d7bfdc57d1c7e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -994,17 +994,28 @@ struct Matmul : public PatternBase { PATTERN_DECL_NODE(matmul_out); }; -// Matmul_v2 op -// Forward pass for matmul_v2. +// MatmulV2: tensor * weight +struct MatmulV2Weight : public PatternBase { + MatmulV2Weight(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "matmul_v2_weight") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); +}; + +// MatmulV2: tensor * tensor or tensor * weight struct MatmulV2 : public PatternBase { MatmulV2(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "matmul_v2") {} PDNode* operator()(); - PATTERN_DECL_NODE(matmul_in_x); - PATTERN_DECL_NODE(matmul_in_y); - PATTERN_DECL_NODE(matmul_op); - PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); }; // Squeeze2 + Matmul diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index cdec49260f90c..865b556f301c0 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -68,7 +68,7 @@ MapMatmul2MulPass::MapMatmul2MulPass() { .End(); } -MapMatmulv2ToMulPass::MapMatmulv2ToMulPass() { +MapMatmulV2ToMulPass::MapMatmulV2ToMulPass() { AddOpCompat(OpCompat("matmul_v2")) .AddInput("X") .IsTensor() .End() @@ -104,6 +104,45 @@ MapMatmulv2ToMulPass::MapMatmulv2ToMulPass() { .End(); } +MapMatmulV2ToMatmulPass::MapMatmulV2ToMatmulPass() { + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType<bool>() + .End() + .AddAttr("trans_y") + .IsType<bool>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumEQ(1.0f) + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("transpose_X") + .IsType<bool>() + .End() + .AddAttr("transpose_Y") + .IsType<bool>() + .End(); +} + Flatten2MatmulFusePass::Flatten2MatmulFusePass() { AddOpCompat(OpCompat("matmul")) .AddInput("X") @@ -246,15 +285,11 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector<int64_t> y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; - - std::vector<Node*>& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; + flag = flag && x_rank >= 2 && y_rank == 2; if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "MapMatmul2MulPass in op compat failed.";
return; } OpDesc desc(matmul_op->Op()->Block()); @@ -268,6 +303,8 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_in_x, mul_node); @@ -287,66 +324,72 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } -void MapMatmulv2ToMulPass::ApplyImpl(ir::Graph* graph) const { +void MapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); std::string name_scope = "map_matmul_v2_to_mul_pass"; FusePassBase::Init(name_scope, graph); GraphPatternDetector gpd; - patterns::MatmulV2 matmul_pattern(gpd.mutable_pattern(), name_scope); - matmul_pattern(); + patterns::MatmulV2Weight matmul_v2_weight_pattern(gpd.mutable_pattern(), + name_scope); + matmul_v2_weight_pattern(); int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "map matmul_v2 to mul"; - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern); - bool flag = true; + VLOG(3) << "map matmul_v2 to mul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, + matmul_v2_weight_pattern); - bool trans_x = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_x")); - bool trans_y = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_y")); + bool flag = true; + bool trans_x = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_x")); + bool trans_y = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_y")); flag = flag && !trans_x && !trans_y; - std::vector<int64_t> x_shape = matmul_in_x->Var()->GetShape(); - std::vector<int64_t> y_shape = matmul_in_y->Var()->GetShape(); + std::vector<int64_t> x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector<int64_t> y_shape = matmul_v2_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; - - std::vector<Node*>& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; + flag = flag && x_rank >= 2 && y_rank == 2; if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "MapMatmulV2ToMulPass in op compat failed."; return; } - OpDesc desc(matmul_op->Op()->Block()); + OpDesc desc(matmul_v2_op->Op()->Block()); desc.SetType("mul"); - desc.SetInput("X", {matmul_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); desc.SetAttr("x_num_col_dims", static_cast<int>(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); - if
(matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); - desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", + matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(matmul_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - GraphSafeRemoveNodes(graph, {matmul_op}); + IR_NODE_LINK_TO(matmul_v2_in_x, mul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, mul_node); + IR_NODE_LINK_TO(mul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); ++found_count; if (!IsCompat(desc)) { - LOG(WARNING) << "MapMatmulv2ToMulPass in out mul op compat failed."; + LOG(WARNING) << "MapMatmulV2ToMulPass in out mul op compat failed."; return; } } @@ -356,6 +399,82 @@ void MapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } +void MapMatmulV2ToMatmulPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "map_matmul_v2_to_matmul_pass"; + FusePassBase::Init(name_scope, graph); + + GraphPatternDetector gpd; + patterns::MatmulV2 matmul_v2_pattern(gpd.mutable_pattern(), name_scope); + matmul_v2_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "map matmul_v2 to matmul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, matmul_v2_pattern); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in op compat failed."; + return; + } + + std::vector<int64_t> x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector<int64_t> y_shape = matmul_v2_in_y->Var()->GetShape(); + if (x_shape.size() != y_shape.size()) { + LOG(WARNING) + << "matmul op not support broadcast, please check inputs'shape. "; + return; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + LOG(WARNING) << "matmul op not support broadcast, please check " + "inputs'shape[i].
"; + return; + } + } + + OpDesc desc(matmul_v2_op->Op()->Block()); + desc.SetType("matmul"); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); + desc.SetAttr("transpose_X", matmul_v2_op->Op()->GetAttr("trans_x")); + desc.SetAttr("transpose_Y", matmul_v2_op->Op()->GetAttr("trans_y")); + desc.SetAttr("alpha", 1.0f); + if (matmul_v2_op->Op()->HasAttr("use_mkldnn")) { + desc.SetAttr("use_mkldnn", matmul_v2_op->Op()->GetAttr("use_mkldnn")); + } + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); + } + auto matmul_node = g->CreateOpNode(&desc); + IR_NODE_LINK_TO(matmul_v2_in_x, matmul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, matmul_node); + IR_NODE_LINK_TO(matmul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); + ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in out matmul op compat failed."; + return; + } + }; + + gpd(graph, handler); + AddStatis(found_count); +} + void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -402,7 +521,7 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Squeeze2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -416,6 +535,8 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(squeeze2_in_x, mul_node); @@ -544,7 +665,7 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -558,9 +679,11 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } if (!IsCompat(desc)) { - LOG(WARNING) << "reshape2 matmul pass in out mul op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in out mul op compat failed."; return; } auto mul_node = g->CreateOpNode(&desc); @@ -629,7 +752,7 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (pattern_found) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Flatten2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -643,6 +766,8 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); 
desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(flatten2_in_x, mul_node); @@ -674,13 +799,21 @@ REGISTER_PASS_CAPABILITY(map_matmul_to_mul_pass) .EQ("mul", 0)); REGISTER_PASS(map_matmul_v2_to_mul_pass, - paddle::framework::ir::MapMatmulv2ToMulPass); + paddle::framework::ir::MapMatmulV2ToMulPass); REGISTER_PASS_CAPABILITY(map_matmul_v2_to_mul_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("matmul_v2", 0) .EQ("mul", 0)); +REGISTER_PASS(map_matmul_v2_to_matmul_pass, + paddle::framework::ir::MapMatmulV2ToMatmulPass); +REGISTER_PASS_CAPABILITY(map_matmul_v2_to_matmul_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .LE("matmul", 1)); + REGISTER_PASS(squeeze2_matmul_fuse_pass, paddle::framework::ir::Squeeze2MatmulFusePass); REGISTER_PASS_CAPABILITY(squeeze2_matmul_fuse_pass) diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h index 8f462810fce51..a924cd8ddf92c 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h @@ -49,10 +49,22 @@ class MapMatmul2MulPass : public FusePassBase { /* * Map matmul_v2 to mul, the same as MapMatmul2MulPass. */ -class MapMatmulv2ToMulPass : public FusePassBase { +class MapMatmulV2ToMulPass : public FusePassBase { public: - MapMatmulv2ToMulPass(); - virtual ~MapMatmulv2ToMulPass() {} + MapMatmulV2ToMulPass(); + virtual ~MapMatmulV2ToMulPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +/* + * Map matmul_v2 to matmul, not supoort broadcast. 
+ */ +class MapMatmulV2ToMatmulPass : public FusePassBase { + public: + MapMatmulV2ToMatmulPass(); + virtual ~MapMatmulV2ToMatmulPass() {} protected: void ApplyImpl(Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 4c0b28fd42266..8bbe6a12d8abc 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -461,7 +461,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X"); auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); @@ -1174,6 +1174,23 @@ MultiHeadMatmulV3FusePass::MultiHeadMatmulV3FusePass() { .IsType() .End(); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") // QK(true) QKV(false) + .IsType() + .End(); + AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index 0c6d49042d22d..7f1e3670056fc 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,11 +50,13 @@ #include #include #include -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { +void* AlignedMalloc(size_t size, size_t alignment); +void AlignedFree(void* memory_ptr); + class EventCount { public: class Waiter; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7e16c3619d61c..d6ea840362e7e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -23,6 +23,8 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); +constexpr const char* kExceptionCaught = "ExceptionCaught"; + namespace paddle { namespace framework { // NOTE(Aurelius84): Need a better strategy to determine it. 
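The AlignedMalloc/AlignedFree declarations added to event_count.h above stand in for the removed workqueue_utils.h include; the real definitions live in workqueue_utils.cc, whose platform guards appear later in this diff. A minimal portable sketch of such a pair, assuming the common over-allocate-and-stash-pointer technique rather than Paddle's exact platform-specific implementation:

#include <cstdint>
#include <cstdlib>

// Sketch only: alignment must be a power of two. Over-allocate, round the
// address up, and stash the raw pointer just before the aligned block so
// AlignedFree can recover it.
void* AlignedMalloc(size_t size, size_t alignment) {
  void* raw = std::malloc(size + alignment + sizeof(void*));
  if (raw == nullptr) return nullptr;
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(raw) + sizeof(void*);
  addr = (addr + alignment - 1) & ~(alignment - 1);
  void* aligned = reinterpret_cast<void*>(addr);
  static_cast<void**>(aligned)[-1] = raw;  // remember the raw pointer
  return aligned;
}

void AlignedFree(void* memory_ptr) {
  if (memory_ptr != nullptr) std::free(static_cast<void**>(memory_ptr)[-1]);
}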
@@ -37,11 +39,14 @@ InterpreterCore::InterpreterCore(const platform::Place& place, main_program_(main_prog), global_scope_(global_scope), stream_analyzer_(place), - async_work_queue_(kHostNumThreads) { + async_work_queue_(kHostNumThreads, &main_thread_blocker_) { is_build_ = false; feed_names_ = feed_names; + exception_notifier_ = main_thread_blocker_.RegisterEvent( + kExceptionCaught, [this]() { return exception_holder_.IsCaught(); }); + // Step1: add feedop and fetchop to main_program AddFetch(fetch_names); @@ -360,6 +365,8 @@ void InterpreterCore::ExecuteInstructionList( async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); op_run_number_ = 0; + exception_holder_.Clear(); + for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { async_work_queue_.AddTask(vec_instr[i].type_, @@ -367,7 +374,13 @@ void InterpreterCore::ExecuteInstructionList( } } - async_work_queue_.WaitEmpty(); + auto event_id = main_thread_blocker_.WaitEvent(); + VLOG(3) << "event_id " << event_id; + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught " << exception_holder_.Type(); + exception_holder_.ReThrow(); + } PADDLE_ENFORCE_EQ( op_run_number_.load(), vec_instr.size(), @@ -440,11 +453,34 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { instr_id = ready_ops.front(); ready_ops.pop(); auto& instr_node = vec_instruction_[instr_id]; - platform::RecordEvent instruction_event( - instr_node.kernel_func_.operator_base_->Type()); + auto* op = instr_node.kernel_func_.operator_base_; + platform::RecordEvent instruction_event(op->Type()); event_manager_.WaitEvent(instr_node, place_); - RunInstruction(instr_node); + try { + RunInstruction(instr_node); + } catch (platform::EnforceNotMet& ex) { + framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); + exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); + } catch (platform::EOFException&) { + exception_holder_.Catch(std::current_exception()); + } catch (std::exception& ex) { + LOG(WARNING) << op->Type() << " raises an exception " + << platform::demangle(typeid(ex).name()) << ", " + << ex.what(); + exception_holder_.Catch(std::current_exception()); + } catch (...) 
{ + LOG(WARNING) << op->Type() << " raises an unknown exception"; + exception_holder_.Catch(std::current_exception()); + } + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + if (exception_notifier_ != nullptr) { + exception_notifier_->NotifyEvent(); + } + return; + } event_manager_.RecordEvent(instr_node, place_); op_run_number_.fetch_add(1, std::memory_order_relaxed); diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index d6c916b9ddc4c..9fba5f2cdce8b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/new_executor/event_manager.h" #include "paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" @@ -26,6 +27,7 @@ #include "paddle/fluid/framework/new_executor/profiler.h" #include "paddle/fluid/framework/new_executor/stream_analyzer.h" #include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" @@ -95,7 +97,10 @@ class InterpreterCore { InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; EventManager event_manager_; + EventsWaiter main_thread_blocker_; interpretercore::AsyncWorkQueue async_work_queue_; + details::ExceptionHolder exception_holder_; + std::shared_ptr exception_notifier_{nullptr}; InterpreterCoreGarbageCollector gc_; std::vector gc_event_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 2a5942c712365..b1e1c02ab9513 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -53,16 +54,19 @@ using AtomicVectorSizeT = std::vector>>; class AsyncWorkQueue { public: - explicit AsyncWorkQueue(size_t host_num_threads) + AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter) : host_num_thread_(host_num_threads) { std::vector group_options; // for execute host Kernel group_options.emplace_back(/*num_threads*/ host_num_threads, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); // for launch device Kernel group_options.emplace_back(/*num_threads*/ 1, - /*allow_spinning*/ true, /*track_task*/ true); + /*allow_spinning*/ true, + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); queue_group_ = CreateWorkQueueGroup(group_options); } @@ -71,12 +75,14 @@ class AsyncWorkQueue { AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); - void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } + // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } void AddTask(const OpFuncType& op_func_type, std::function fn) { 
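// The OpFuncType value doubles as the queue index here: queue 0 is the multi-threaded host-kernel queue and queue 1 the single-threaded device-launch queue, matching the two group_options entries emplaced in the constructor above (the enum-to-index mapping is presumed from that order).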
queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); } + void Cancel() { queue_group_->Cancel(); } + AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; } AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; } diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 2997ce1fe2473..6e56532456c6f 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -19,9 +19,12 @@ namespace paddle { namespace framework { +template class TaskTracker { public: - TaskTracker() : wait_empty_cv_(1) {} + TaskTracker() = default; + + explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} TaskTracker(const TaskTracker&) = delete; @@ -33,32 +36,17 @@ class TaskTracker { void SubCounter() { if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { - wait_empty_cv_.Notify(true); + if (notifier_ != nullptr) { + notifier_->NotifyEvent(); + } } } - // only one user can wait at any time - void WaitTaskNumToZero() { - bool waiting = false; - if (!wait_empty_.compare_exchange_strong(waiting, true, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { - abort(); - } - EventCount::Waiter* w = wait_empty_cv_.GetWaiter(0); - wait_empty_cv_.Prewait(); - if (num_tasks_.load(std::memory_order_relaxed) == 0) { - wait_empty_cv_.CancelWait(); - } else { - wait_empty_cv_.CommitWait(w); - } - wait_empty_.store(false); - } + uint64_t PendingTaskNum() { return num_tasks_.load(); } private: alignas(64) std::atomic num_tasks_{0}; - alignas(64) EventCount wait_empty_cv_; - alignas(64) std::atomic wait_empty_{false}; + Notifier* notifier_{nullptr}; }; template @@ -185,6 +173,12 @@ class ThreadPoolTempl { ec_.Notify(true); } + void WaitThreadsExit() { + for (size_t i = 0; i < thread_data_.size(); ++i) { + thread_data_[i].thread->WaitExit(); + } + } + size_t NumThreads() const { return num_threads_; } int CurrentThreadId() const { diff --git a/paddle/fluid/framework/new_executor/thread_environment.h b/paddle/fluid/framework/new_executor/thread_environment.h index be936274186f4..eb1ee4de90898 100644 --- a/paddle/fluid/framework/new_executor/thread_environment.h +++ b/paddle/fluid/framework/new_executor/thread_environment.h @@ -25,7 +25,16 @@ struct StlThreadEnvironment { class EnvThread { public: explicit EnvThread(std::function f) : thr_(std::move(f)) {} - ~EnvThread() { thr_.join(); } + void WaitExit() { + if (thr_.joinable()) { + thr_.join(); + } + } + ~EnvThread() { + if (thr_.joinable()) { + thr_.join(); + } + } private: std::thread thr_; diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index 8c6eeab4d5c0a..7607b3a297f84 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -13,13 +13,18 @@ namespace paddle { namespace framework { namespace { +using TaskTracker = TaskTracker; + class WorkQueueImpl : public WorkQueue { public: - explicit WorkQueueImpl(const WorkQueueOptions& options) - : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { - if (options_.track_task) { + explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options) { + if (options_.track_task && options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto notifier 
= options.queue_empty_waiter->RegisterEvent( + kQueueEmptyEvent, + [tracker]() { return tracker->PendingTaskNum() == 0; }); + tracker_ = new (storage) TaskTracker(*notifier.get()); } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); @@ -44,20 +49,16 @@ class WorkQueueImpl : public WorkQueue { queue_->AddTask(std::move(fn)); } - void WaitQueueEmpty() override { - if (tracker_ == nullptr) { - PADDLE_THROW( - platform::errors::Unavailable("set WorkQueueOptions.track_task = " - "true before call this interface.")); - } - tracker_->WaitTaskNumToZero(); + void Cancel() override { + queue_->Cancel(); + queue_->WaitThreadsExit(); } size_t NumThreads() const override { return queue_->NumThreads(); } private: - NonblockingThreadPool* queue_; - TaskTracker* tracker_; + NonblockingThreadPool* queue_{nullptr}; + TaskTracker* tracker_{nullptr}; }; class WorkQueueGroupImpl : public WorkQueueGroup { @@ -69,12 +70,12 @@ class WorkQueueGroupImpl : public WorkQueueGroup { void AddTask(size_t queue_idx, std::function fn) override; - void WaitQueueGroupEmpty() override; - size_t QueueNumThreads(size_t queue_idx) const override; size_t QueueGroupNumThreads() const override; + void Cancel() override; + private: std::vector queues_; NonblockingThreadPool* queues_storage_; @@ -92,9 +93,14 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( queues_storage_ = reinterpret_cast(buffer); for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; - if (options.track_task && tracker_ == nullptr) { + if (options.track_task && tracker_ == nullptr && + options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto notifier = options.queue_empty_waiter->RegisterEvent( + kQueueEmptyEvent, + [tracker]() { return tracker->PendingTaskNum() == 0; }); + tracker_ = new (storage) TaskTracker(*notifier.get()); } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -124,15 +130,6 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { queues_[queue_idx]->AddTask(std::move(fn)); } -void WorkQueueGroupImpl::WaitQueueGroupEmpty() { - if (nullptr == tracker_) { - PADDLE_THROW(platform::errors::Unavailable( - "set WorkQueueOptions.track_task = true for at least one of queues " - "before call this interface.")); - } - tracker_->WaitTaskNumToZero(); -} - size_t WorkQueueGroupImpl::QueueNumThreads(size_t queue_idx) const { assert(queue_idx < queues_.size()); return queues_.at(queue_idx)->NumThreads(); @@ -146,6 +143,15 @@ size_t WorkQueueGroupImpl::QueueGroupNumThreads() const { return total_num; } +void WorkQueueGroupImpl::Cancel() { + for (auto queue : queues_) { + queue->Cancel(); + } + for (auto queue : queues_) { + queue->WaitThreadsExit(); + } +} + } // namespace std::unique_ptr CreateSingleThreadedWorkQueue( diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue.h index ead9d9949b700..a299d0aaed7d2 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue.h @@ -21,15 +21,31 @@ namespace paddle { namespace framework { +constexpr const char* kQueueEmptyEvent = "QueueEmpty"; + +class EventsWaiter; + struct WorkQueueOptions { WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) : num_threads(num_threads), 
allow_spinning(allow_spinning), track_task(track_task) {} + WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, + EventsWaiter* waiter) + : num_threads(num_threads), + allow_spinning(allow_spinning), + track_task(track_task), + queue_empty_waiter(waiter) {} + size_t num_threads; bool allow_spinning; + // If you need to block the calling thread to wait for the "queue empty" event, set + // track_task = true and set queue_empty_waiter. EventsWaiter::WaitEvent will + // block the calling thread until any of the registered events (including "queue empty") + // occurs. bool track_task; + EventsWaiter* queue_empty_waiter{nullptr}; // not owned }; class WorkQueue { @@ -44,12 +60,13 @@ class WorkQueue { virtual void AddTask(std::function<void()> fn) = 0; - // set WorkQueueOptions.track_task = true before call this - // interface, otherwise will abort() - virtual void WaitQueueEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueEmpty() = 0; virtual size_t NumThreads() const = 0; + virtual void Cancel() = 0; + protected: WorkQueueOptions options_; }; @@ -67,14 +84,15 @@ class WorkQueueGroup { virtual void AddTask(size_t queue_idx, std::function<void()> fn) = 0; - // set WorkQueueOptions.track_task = true for at least one of queues - // before call this interface, otherwise will abort() - virtual void WaitQueueGroupEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueGroupEmpty() = 0; virtual size_t QueueNumThreads(size_t queue_idx) const = 0; virtual size_t QueueGroupNumThreads() const = 0; + virtual void Cancel() = 0; + protected: std::vector<WorkQueueOptions> queues_options_; }; diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue_test.cc index c229a84b145ab..3ea0096b631e8 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue_test.cc @@ -16,18 +16,21 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic<bool> finished{false}; std::atomic<unsigned> counter{0}; constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -42,7 +45,7 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { }); // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum); } @@ -52,13 +55,15 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic<bool> finished{false}; std::atomic<unsigned> counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto
work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -75,24 +80,28 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum); + // Cancel + work_queue->Cancel(); } TEST(WorkQueue, TestWorkQueueGroup) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueueGroup; using paddle::framework::CreateWorkQueueGroup; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; - // CreateMultiThreadedWorkQueue + // ThreadedWorkQueueGroup + EventsWaiter events_waiter; WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); @@ -112,7 +121,9 @@ TEST(WorkQueue, TestWorkQueueGroup) { ++counter; } }); - // WaitQueueGroupEmpty() - queue_group->WaitQueueGroupEmpty(); + // WaitQueueGroupEmpty + events_waiter.WaitEvent(); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum); + // Cancel + queue_group->Cancel(); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc index 2ea49e676a807..2c81cffb49d82 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -55,5 +55,62 @@ void AlignedFree(void* mem_ptr) { #endif } +constexpr EventsWaiter::EventId kEmptyEventId = -1; + +EventsWaiter::EventsWaiter() + : trigger_event_(kEmptyEventId), waiting_(false), cv_(1) {} + +std::shared_ptr EventsWaiter::RegisterEvent( + const std::string& name, EventChecker checker) { + names_.emplace_back(name); + checkers_.emplace_back(std::move(checker)); + EventId id = checkers_.size() - 1; + auto notifier = std::shared_ptr(new EventNotifier(id, this)); + notifiers_.emplace_back(notifier); + return notifier; +} + +std::string EventsWaiter::WaitEvent() { + // only one user can wait at any time + bool waiting = false; + if (!waiting_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + PADDLE_THROW( + platform::errors::ResourceExhausted("Another thread is waiting.")); + } + EventId id = kEmptyEventId; + auto w = cv_.GetWaiter(0); + cv_.Prewait(); + int64_t event_num = checkers_.size(); + for (int64_t i = 0; id == kEmptyEventId && i < event_num; ++i) { + if (checkers_[i]()) { + id = i; + } + } + if (id != kEmptyEventId) { + cv_.CancelWait(); + } else { + cv_.CommitWait(w); + id = trigger_event_.load(std::memory_order_relaxed); + } + trigger_event_.store(kEmptyEventId, std::memory_order_relaxed); + waiting_.store(false); + return names_.at(id); +} + +void EventsWaiter::SetTriggerEvent(const EventId& id) { + trigger_event_.store(id, std::memory_order_relaxed); + cv_.Notify(true); +} + +std::string EventsWaiter::EventNotifier::GetEventName() { + return waiter_.names_.at(id_); +} + +void EventsWaiter::EventNotifier::NotifyEvent() { + waiter_.SetTriggerEvent(id_); +} + } // namespace framework } // namespace 
paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index bb219fea36267..a06d9f319dfee 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -18,6 +18,11 @@ #include #include #include +#include +#include +#include +#include +#include "paddle/fluid/framework/new_executor/event_count.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -64,5 +69,56 @@ void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); +// A multiplexing waiter, able to wait for multiple events simultaneously. +// Blocks the calling thread until any of the registered events occurs. +// Non-thread-safe. +class EventsWaiter { + public: + using EventId = int64_t; + + using EventChecker = std::function<bool()>; + + class EventNotifier { + public: + void NotifyEvent(); + + EventId GetEventId() { return id_; } + + std::string GetEventName(); + + private: + friend EventsWaiter; + EventNotifier(EventId id, EventsWaiter* waiter) + : id_(id), waiter_(*waiter) {} + + EventId id_; + EventsWaiter& waiter_; + }; + + EventsWaiter(); + + EventsWaiter(const EventsWaiter&) = delete; + + EventsWaiter& operator=(const EventsWaiter&) = delete; + + // All the RegisterEvent calls must be made before any WaitEvent + std::shared_ptr<EventNotifier> RegisterEvent(const std::string& name, + EventChecker checker); + + // Wait for any of the registered events + std::string WaitEvent(); + + private: + friend EventNotifier; + void SetTriggerEvent(const EventId& id); + + std::vector<std::string> names_; + std::vector<EventChecker> checkers_; + std::vector<std::shared_ptr<EventNotifier>> notifiers_; + std::atomic<EventId> trigger_event_; + std::atomic<bool> waiting_; + EventCount cv_; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 4a65333217727..04931c7c4b35e 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -1,9 +1,11 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc) -cc_library(cinn_compiled_object SRCS cinn_compiled_object.cc DEPS feed_fetch_method graph lod_tensor proto_desc) -cc_library(cinn_runner SRCS cinn_runner.cc DEPS cinn_cache_key cinn_compiled_object feed_fetch_method graph lod_tensor scope) -cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector) +cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector cinn_compiler) +cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) +cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph graph_helper transform_desc cinn) +cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) -cc_test(cinn_runner_test SRCS cinn_runner_test.cc DEPS cinn_runner proto_desc) -cc_test(cinn_compiled_object_test SRCS cinn_compiled_object_test.cc DEPS cinn_compiled_object) -cc_test(test_build_cinn_pass SRCS build_cinn_pass_test.cc DEPS build_cinn_pass) +cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) +cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) +cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization)
+cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index caddc8fbb7381..0664a63c2b72b 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -14,45 +14,21 @@ limitations under the License. */ #include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include +#include #include #include #include #include +#include #include +#include "cinn/frontend/op_mapper_registry.h" +#include "cinn/frontend/op_mappers/use_op_mappers.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/subgraph_detector.h" -// #include "cinn/frontend/op_mapper_registry.h" -// #include "cinn/frontend/op_mappers/use_op_mappers.h" - -// TODO(jiangcheng05): just for local compile, remove after -// paddle and CINN have been binded -// The APIs are the same as CINN: -// https://github.com/PaddlePaddle/CINN/blob/develop/cinn/utils/registry.h -namespace cinn { -namespace frontend { -class OpMapperRegistry { - public: - static OpMapperRegistry* Global() { - static OpMapperRegistry inst; - return &inst; - } - - inline const OpMapperRegistry* Find(const std::string& name) { - std::unordered_set fmap_ = {"mul", "add", "relu", "sigmoid", - "softmax"}; - auto p = fmap_.find(name); - if (p != fmap_.end()) { - return this; - } else { - return nullptr; - } - } -}; - -} // namespace frontend -} // namespace cinn +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" namespace paddle { namespace framework { @@ -138,20 +114,26 @@ void AddOutputVar(const std::unordered_set& output_vars, // var node are from internal nodes std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, const GraphNodeSet& cluster_internals, - const GraphNodeSet& cluster_inputs) { + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { // Graph's constructor must has one parameter, and in our code, // the ProgramDesc is useless, so here we pass a temporary object. - auto sub_graph = std::make_unique(framework::ProgramDesc()); + auto subgraph = std::make_unique(framework::ProgramDesc()); std::unordered_map old_op2new_op; for (auto* op : cluster) { - auto sub_node = sub_graph->CreateOpNode(op->Op()); + auto sub_node = subgraph->CreateOpNode(op->Op()); old_op2new_op[op] = sub_node; } std::unordered_map old_var2new_var; for (auto* var : cluster_internals) { - auto sub_node = sub_graph->CreateVarNode(var->Var()); + Node* sub_node; + if (var->Var() == nullptr) { + sub_node = subgraph->CreateEmptyNode(var->Name(), var->NodeType()); + } else { + sub_node = subgraph->CreateVarNode(var->Var()); + } old_var2new_var[var] = sub_node; } @@ -164,7 +146,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, for (auto* var : op->inputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); - } else if (cluster_inputs.count(var)) { + } else if (cluster_inputs.count(var) && var->Var() != nullptr) { if (var->Var()->IsParameter()) { // Parameters have been preserved in scope, compared to feed var, // param just need add new var and don't need add feed op. 
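The guards above leave three kinds of cluster inputs: persistable parameters (already present in the scope, so only a var node is copied), ordinary vars (which receive a feed op via AddFeedOpAndVar), and desc-less nodes such as control-dependency vars (skipped by the var->Var() != nullptr check). The grouping itself is produced by AnalyseClusterVariables below; a condensed sketch of the underlying rule, reusing this file's Node and GraphNodeSet types (the helper name is illustrative, not the pass's actual code):

// Sketch: a var node is internal to a cluster iff every producer and
// consumer op lies inside the cluster; otherwise it is a cluster input
// (when some cluster op consumes it) or a cluster output (when produced).
void ClassifyClusterVars(const GraphNodeSet& cluster, GraphNodeSet* inputs,
                         GraphNodeSet* outputs, GraphNodeSet* internals) {
  auto enclosed = [&cluster](Node* var) {
    for (auto* op : var->inputs) {   // producer ops
      if (cluster.count(op) == 0) return false;
    }
    for (auto* op : var->outputs) {  // consumer ops
      if (cluster.count(op) == 0) return false;
    }
    return true;
  };
  for (auto* op : cluster) {
    for (auto* var : op->inputs) {
      if (enclosed(var)) internals->insert(var);
      else inputs->insert(var);
    }
    for (auto* var : op->outputs) {
      if (enclosed(var)) internals->insert(var);
      else outputs->insert(var);
    }
  }
}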
@@ -181,7 +163,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, for (auto* var : op->outputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); - } else { + } else if (cluster_outputs.count(var) && var->Var() != nullptr) { // Create new output var node to guarantee the independency of // subgraph. In other words, the subgraph has no connection with // other graph, even the input graph. @@ -190,9 +172,9 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, } } - AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, sub_graph.get()); - AddParamVar(param_vars, cluster, old_op2new_op, sub_graph.get()); - AddOutputVar(output_vars, cluster, old_op2new_op, sub_graph.get()); + AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, subgraph.get()); + AddParamVar(param_vars, cluster, old_op2new_op, subgraph.get()); + AddOutputVar(output_vars, cluster, old_op2new_op, subgraph.get()); for (auto* var : cluster_internals) { for (auto* op : var->inputs) { @@ -207,7 +189,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, } } - return sub_graph; + return subgraph; } // This interface is used to classify all variables involved in a cluster into @@ -256,11 +238,30 @@ void AnalyseClusterVariables(const GraphNodeSet& cluster, } } -Node* AddSpecialOpToGraph(Graph* graph, const GraphNodeSet& cluster_inputs, - const GraphNodeSet& cluster_outputs) { +Node* AddSpecialOpToGraph(const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + const std::string& compilation_key, Graph* graph) { // add special cinn op framework::OpDesc special_op_desc; special_op_desc.SetType(kCinnLaunchOp); + std::vector input_names; + std::for_each(cluster_inputs.begin(), cluster_inputs.end(), + [&input_names](Node* n) { + if (n->Var() != nullptr) { + input_names.emplace_back(n->Name()); + } + }); + special_op_desc.SetInput("X", input_names); + std::vector output_names; + std::for_each(cluster_outputs.begin(), cluster_outputs.end(), + [&output_names](Node* n) { + if (n->Var() != nullptr) { + output_names.emplace_back(n->Name()); + } + }); + special_op_desc.SetOutput("Out", output_names); + special_op_desc.SetAttr(kCompilationKey, compilation_key); + special_op_desc.Flush(); auto* special_op_node = graph->CreateOpNode(&special_op_desc); special_op_node->inputs.assign(cluster_inputs.begin(), cluster_inputs.end()); special_op_node->outputs.assign(cluster_outputs.begin(), @@ -268,9 +269,9 @@ Node* AddSpecialOpToGraph(Graph* graph, const GraphNodeSet& cluster_inputs, return special_op_node; } -void AddLinkToSpecialOp(Node* special_op_node, - const GraphNodeSet& cluster_inputs, - const GraphNodeSet& cluster_outputs) { +void AddLinkToSpecialOp(const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + Node* special_op_node) { // add new link from cluster_inputs to special_op_node for (auto* var_node : cluster_inputs) { var_node->outputs.push_back(special_op_node); @@ -338,14 +339,15 @@ void ReplaceSubGraphWithSpecialOpNode(const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs, const GraphNodeSet& cluster_outputs, const GraphNodeSet& cluster_internals, + const std::string& compilation_key, Graph* graph) { // First, add the special op node whose name is "kCinnLaunchOp" into graph - auto special_op_node = - AddSpecialOpToGraph(graph, cluster_inputs, cluster_outputs); + auto special_op_node = AddSpecialOpToGraph(cluster_inputs, cluster_outputs, + compilation_key, graph); // Second, remove all graph's 
links which are from or to cluster nodes RemoveLinkFromCluster(cluster, cluster_inputs, cluster_outputs); // Third, add new links from or to the special op node - AddLinkToSpecialOp(special_op_node, cluster_inputs, cluster_outputs); + AddLinkToSpecialOp(cluster_inputs, cluster_outputs, special_op_node); // Finally, remove the cinn sub graph from graph RemoveSubGraphFromGraph(cluster, cluster_internals, graph); } @@ -354,8 +356,7 @@ // Here we use SubgraphDetector to detect subgraphs in which // every op node is supported by CINN. We use OpMapperRegistry // to check whether an op node is supported by CINN. -void SearchAllSubgraphs(Graph* graph, - std::vector<std::unique_ptr<Graph>>* cinn_subgraphs) { +void SearchAllSubgraphs(Graph* graph) { auto teller = [](const Node* node) { return ::cinn::frontend::OpMapperRegistry::Global()->Find(node->Name()) != nullptr; @@ -363,29 +364,26 @@ std::vector clusters = framework::ir::SubgraphDetector(graph, teller)(); - cinn_subgraphs->clear(); + auto* cinn_compiler = CinnCompiler::GetInstance(); for (const auto& node_vec : clusters) { - // classify var node to inputs, outputs, and internals. + // Classify var node to inputs, outputs, and internals. GraphNodeSet cluster_set(node_vec.begin(), node_vec.end()); GraphNodeSet cluster_inputs, cluster_outputs, cluster_internals; AnalyseClusterVariables(cluster_set, &cluster_inputs, &cluster_outputs, &cluster_internals); - - cinn_subgraphs->emplace_back( - CreateNewSubGraph(cluster_set, cluster_internals, cluster_inputs)); - - // replacing subgraph to a new special op node + // Create a new subgraph according to the found cluster and + // save it in CinnCompiler + std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( + cluster_set, cluster_internals, cluster_inputs, cluster_outputs)); + // Replace the found cluster with a new special op node ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, - cluster_outputs, cluster_internals, graph); + cluster_outputs, cluster_internals, + compilation_key, graph); } } -void BuildCinnPass::ApplyImpl(Graph* graph) const { - auto& cinn_subgraphs = - Get<std::vector<std::unique_ptr<Graph>>>("cinn_subgraphs"); - SearchAllSubgraphs(graph, &cinn_subgraphs); -} +void BuildCinnPass::ApplyImpl(Graph* graph) const { SearchAllSubgraphs(graph); } } // namespace paddle2cinn } // namespace framework diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index e71160ba108ec..556ff228915e4 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -21,6 +21,7 @@ namespace framework { namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "CinnLaunchOp"; +constexpr char kCompilationKey[] = "compilation_key"; // A pass named BuildCinnPass, the function of this pass is: // @@ -39,12 +40,13 @@ constexpr char kCinnLaunchOp[] = "CinnLaunchOp"; // Firstly, both op nodes should be compile supported. // Secondly, there should be a direct path between the two op nodes through a // var node. -// Thirdly, there should be no extral path between the two op nodes through +// Thirdly, there should be no extra path between the two op nodes through // unsupported op nodes. // Lastly, if op nodes a and b can be divided into a cluster, op nodes b and c -// can be devided into a cluster, a and c can also be devided into a cluster.
-// The implementation of cluster detection is enclosured in class -// SubGraphDetector. +// can be divided into a cluster, a and c can also be divided into a cluster. +// The implementation of cluster detection is encapsulated in the +// SubGraphDetector +// class. // // b) How to deal with the links between the var nodes in global graph and the // op nodes in a cluster? diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf68a2b554b7f..79a27dccb4b00 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "gtest/gtest.h" @@ -23,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" @@ -49,9 +51,10 @@ inline int CountNode(const std::unordered_set& nodes, inline Node* GetNode(const std::unordered_set& nodes, const std::string& op_name) { - return *std::find_if( - nodes.begin(), nodes.end(), - [&op_name](const Node* node) { return node->Name() == op_name; }); + return *std::find_if(nodes.begin(), nodes.end(), + [&op_name](const Node* node) { + return node->Name().find(op_name) != std::string::npos; + }); } inline bool CheckGraphIndependence(const std::unordered_set& nodes) { @@ -83,6 +86,18 @@ inline bool CheckGraphIndependence(const std::unordered_set& nodes) { return true; } +// Get compilation_key values +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; + for (auto& node : graph.Nodes()) { + if (node->IsOp() && node->Name() == kCinnLaunchOp) { + compilation_keys.emplace_back( + BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey))); + } + } + return compilation_keys; +} + std::unique_ptr BuildNoCinnSubgraph() { ProgramDesc prog; auto g = std::make_unique(prog); @@ -133,17 +148,14 @@ TEST(BuildCinnPassTest, NoCinnSubgraph) { auto pass = paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); - std::vector> cinn_subgraphs; - pass->SetNotOwned>>("cinn_subgraphs", - &cinn_subgraphs); pass->Apply(g.get()); // After search, origin graph should no change ASSERT_EQ(previous_nodes, g->Nodes()); ASSERT_TRUE(CheckGraphIndependence(g->Nodes())); - // After search, there should one cinn subgraph - ASSERT_TRUE(cinn_subgraphs.empty()); + // After search, there should be no cinn subgraph + ASSERT_TRUE(GetCompilationKeys(*g).empty()); } std::unique_ptr BuildAllOpSupportCinnGraph() { @@ -175,22 +187,25 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { ir::Node* mul = g->CreateOpNode(&mul_op); ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* v0 = g->CreateEmptyNode("var0", Node::Type::kVariable); ir::Node* v1 = g->CreateVarNode(&var1); ir::Node* v2 = g->CreateVarNode(&var2); ir::Node* v3 = g->CreateVarNode(&var3); ir::Node* v4 = g->CreateVarNode(&var4); ir::Node* v5 = g->CreateVarNode(&var5); ir::Node* v6 = g->CreateVarNode(&var6); + ir::Node* v7 = g->CreateControlDepVar(); // fill op node - mul->inputs = {v1, v2}; + mul->inputs = {v0, v1, v2}; mul->outputs = {v3}; add->inputs = {v3, v4}; add->outputs = {v5}; relu->inputs = {v5}; - relu->outputs = {v6}; + relu->outputs = {v6, v7}; // fill variable node + 
v0->outputs = {mul}; v1->outputs = {mul}; v2->outputs = {mul}; @@ -203,6 +218,7 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { v5->outputs = {relu}; v6->inputs = {relu}; + v7->inputs = {relu}; return g; } @@ -212,31 +228,31 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { auto pass = paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); - std::vector> cinn_subgraphs; - pass->SetNotOwned>>("cinn_subgraphs", - &cinn_subgraphs); pass->Apply(g.get()); // After search, the graph should as following - // v1 --| - // v2 --| --> kCinnLaunchOp --> v6 + // v0 --| + // v1 --| |--> v6 + // v2 --| --> kCinnLaunchOp |--> v7 // v4 --| const auto& nodes = g->Nodes(); - ASSERT_EQ(nodes.size(), static_cast(5)); + ASSERT_EQ(nodes.size(), static_cast(7)); ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); auto* cinn_op = GetNode(nodes, kCinnLaunchOp); + auto* v0 = GetNode(nodes, "var0"); auto* v1 = GetNode(nodes, "var1"); auto* v2 = GetNode(nodes, "var2"); auto* v4 = GetNode(nodes, "var4"); auto* v6 = GetNode(nodes, "var6"); + auto* v7 = GetNode(nodes, Node::kControlDepVarName); ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), - std::unordered_set({v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6})); + std::unordered_set({v0, v1, v2, v4})); + ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); @@ -250,10 +266,12 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { // | --> mul --> v3 -- // v2 -- | --> add --> v5 --> relu --> v6 // feed --> v4 -- - ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); - const auto& subgraph = cinn_subgraphs.back(); + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(1)); + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph = cinn_compiler->FindGraph(compilation_keys[0]); - const auto& subnodes = subgraph->Nodes(); + const auto& subnodes = subgraph.Nodes(); ASSERT_EQ(subnodes.size(), static_cast(11)); ASSERT_TRUE(CheckGraphIndependence(subnodes)); @@ -338,9 +356,6 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { auto pass = paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); - std::vector> cinn_subgraphs; - pass->SetNotOwned>>("cinn_subgraphs", - &cinn_subgraphs); pass->Apply(g.get()); // After search, the graph should as following @@ -366,10 +381,12 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { // feed --> v1 -- // | --> mul --> v3 --> relu --> v4 // v2 -- - ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); - const auto& subgraph = cinn_subgraphs.back(); + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(1)); + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph = cinn_compiler->FindGraph(compilation_keys[0]); - const auto& subnodes = subgraph->Nodes(); + const auto& subnodes = subgraph.Nodes(); ASSERT_EQ(subnodes.size(), static_cast(7)); ASSERT_TRUE(CheckGraphIndependence(subnodes)); @@ -450,9 +467,6 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { auto pass = paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); - std::vector> cinn_subgraphs; - pass->SetNotOwned>>("cinn_subgraphs", - &cinn_subgraphs); pass->Apply(g.get()); // After search, the graph should as following @@ -478,7 +492,8 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // After search, there should has 
two cinn subgraphs, // and each of subgraphs just has one node. - ASSERT_EQ(cinn_subgraphs.size(), static_cast(2)); + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(2)); // subgraph1: // feed --> v4 --> relu --> v5 @@ -486,12 +501,13 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // feed --> v1 -- // | --> mul --> v3 // v2 -- - const auto& subgraph1 = cinn_subgraphs[0]; - const auto& subnodes1 = subgraph1->Nodes(); + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph1 = cinn_compiler->FindGraph(compilation_keys[0]); + const auto& subnodes1 = subgraph1.Nodes(); ASSERT_TRUE(CheckGraphIndependence(subnodes1)); - const auto& subgraph2 = cinn_subgraphs[1]; - const auto& subnodes2 = subgraph2->Nodes(); + const auto& subgraph2 = cinn_compiler->FindGraph(compilation_keys[1]); + const auto& subnodes2 = subgraph2.Nodes(); ASSERT_TRUE(CheckGraphIndependence(subnodes2)); if (CheckNodeExisted(subnodes1, "relu")) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index ac6c83be4fae3..923282c59e2d4 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -28,32 +28,38 @@ namespace paddle2cinn { CinnCacheKey::CinnCacheKey( const ir::Graph& graph, - const std::map& feed_tensors) { - this->SetKey(graph, feed_tensors); + const std::map& input_tensors, + const std::string& arch_str) { + this->SetKey(graph, input_tensors, arch_str); } CinnCacheKey::CinnCacheKey(const ir::Graph& graph, - const std::map& feed_shapes) { - this->SetKey(graph, feed_shapes); + const std::map& input_shapes, + const std::string& arch_str) { + this->SetKey(graph, input_shapes, arch_str); } void CinnCacheKey::SetKey( const ir::Graph& graph, - const std::map& feed_tensors) { + const std::map& input_tensors, + const std::string& arch_str) { ProgramDesc program; GraphToProgram(graph, &program); program.Proto()->SerializeToString(&graph_serialize_str_); - for (const auto& name_tensor : feed_tensors) { - feed_shapes_[name_tensor.first] = name_tensor.second->dims(); + for (const auto& name_tensor : input_tensors) { + input_shapes_[name_tensor.first] = name_tensor.second->dims(); } + arch_str_ = arch_str; } void CinnCacheKey::SetKey(const ir::Graph& graph, - const std::map& feed_shapes) { + const std::map& input_shapes, + const std::string& arch_str) { ProgramDesc program; GraphToProgram(graph, &program); program.Proto()->SerializeToString(&graph_serialize_str_); - feed_shapes_ = feed_shapes; + input_shapes_ = input_shapes; + arch_str_ = arch_str; } bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { @@ -62,7 +68,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { bool CinnCacheKey::operator==(const CinnCacheKey& other) const { return graph_serialize_str_ == other.graph_serialize_str_ && - feed_shapes_ == other.feed_shapes_; + input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { @@ -73,12 +79,13 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { std::size_t ret = 0; std::hash string_hasher; - for (const auto& name_shape : key.feed_shapes_) { + for (const auto& name_shape : key.input_shapes_) { ret = hash_combine(ret, string_hasher(name_shape.first)); ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); } ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = 
hash_combine(ret, string_hasher(key.arch_str_)); return ret; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 9627ae92aaba2..02b152a681c44 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -26,24 +26,28 @@ namespace paddle2cinn { // Class to store the keys for compiling CINN. // -// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping +// CINN cannot handle changeable shape now, so CinnCompiler keeps a cache mapping // from CinnCacheKey to CinnCompiledObject. // -// The CinnCacheKey contains a graph serialized string and the feeded tensor +// The CinnCacheKey contains a graph serialized string and the input tensor // shapes. class CinnCacheKey { public: CinnCacheKey(const ir::Graph& graph, - const std::map<std::string, const LoDTensor*>& feed_tensors); + const std::map<std::string, const LoDTensor*>& input_tensors, + const std::string& arch_str); CinnCacheKey(const ir::Graph& graph, - const std::map<std::string, DDim>& feed_shapes); + const std::map<std::string, DDim>& input_shapes, + const std::string& arch_str); ~CinnCacheKey() {} void SetKey(const ir::Graph& graph, - const std::map<std::string, const LoDTensor*>& feed_tensors); + const std::map<std::string, const LoDTensor*>& input_tensors, + const std::string& arch_str); void SetKey(const ir::Graph& graph, - const std::map<std::string, DDim>& feed_shapes); + const std::map<std::string, DDim>& input_shapes, + const std::string& arch_str); bool operator==(const CinnCacheKey& other) const; bool operator!=(const CinnCacheKey& other) const; @@ -55,7 +59,8 @@ class CinnCacheKey { private: std::string graph_serialize_str_; - std::map<std::string, DDim> feed_shapes_; + std::map<std::string, DDim> input_shapes_; + std::string arch_str_; }; } // namespace paddle2cinn diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index a84ade26bfd12..f13f44998211f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -47,17 +47,19 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { DDim ddim = paddle::framework::make_ddim({1, 2, 3}); std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; - CinnCacheKey cache_key1(empty_graph, feed_tensors); - CinnCacheKey cache_key2(empty_graph, feed_shapes); - EXPECT_EQ(cache_key1, cache_key2); - - CinnCacheKey cache_key3(graph, feed_shapes); - CinnCacheKey cache_key4(graph, feed_tensors); + CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKey cache_key2(graph, feed_shapes, "x86"); + CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); EXPECT_EQ(cache_key3, cache_key4); CinnCacheKey cache_key5(empty_graph, - std::map()); - CinnCacheKey cache_key6(empty_graph, std::map()); + std::map(), "unk"); + CinnCacheKey cache_key6(empty_graph, std::map(), "unk"); EXPECT_EQ(cache_key5, cache_key6); EXPECT_NE(cache_key1, cache_key3); @@ -69,19 +71,19 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { EXPECT_NE(cache_key5, cache_key1); EXPECT_NE(cache_key2, cache_key6); + test_set.insert(cache_key0); test_set.insert(cache_key1); - test_set.insert(cache_key2); test_set.insert(cache_key3); test_set.insert(cache_key4); test_set.insert(cache_key5); test_set.insert(cache_key6); EXPECT_EQ(test_set.size(), 3U); - auto iter = test_set.find(cache_key1); + auto iter = test_set.find(cache_key0); EXPECT_NE(iter, test_set.end()); test_set.erase(iter);
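// Aside: the Hash functor exercised by this set folds every (name, shape)
// pair, the serialized program, and the new arch string into one seed;
// hash_combine's body is not shown in this diff, but it is presumably the
// usual boost-style mixer, along the lines of
//   seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)).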
EXPECT_EQ(test_set.size(), 2U); - EXPECT_EQ(test_set.find(cache_key2), test_set.end()); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); iter = test_set.find(cache_key3); EXPECT_NE(iter, test_set.end()); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc deleted file mode 100644 index a90494bafe9bb..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" - -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -CinnCompiledObject::CinnCompiledObject() { - // TODO(zhhsplendid): complete this function after CINN interface is ready -} -CinnCompiledObject::~CinnCompiledObject() { - // TODO(zhhsplendid): complete this function after CINN interface is ready -} - -void CinnCompiledObject::Compile( - const ir::Graph& graph, - std::map* feed_targets) { - // TODO(zhhsplendid): complete this function after CINN interface is ready -} - -std::map CinnCompiledObject::Run( - Scope* scope, std::map* feed_targets) { - // TODO(zhhsplendid): complete this function after CINN interface is ready - return std::map(); -} - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h deleted file mode 100644 index 21191d4434587..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -// Class to store and call CINN complied object -class CinnCompiledObject { - public: - CinnCompiledObject(); - ~CinnCompiledObject(); - - // Compiles use CINN. CINN compilation needs model graph, input names, and - // input_shapes - void Compile(const ir::Graph& graph, - std::map* feed_targets); - - // Feed LoDTensors to tun CINN compiled object and return fetched result - std::map Run( - Scope* scope, std::map* feed_targets); - - // Converts compiled object to Paddle Graph - // To be discussed - // ir::Graph ToGraph(); -}; - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc deleted file mode 100644 index 5a7861edf210c..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -TEST(CinnCompiledObjecctTest, TodoTest) { - ProgramDesc empty_program; - ir::Graph empty_graph(empty_program); - std::map empty_feed; - Scope empty_scope; - - CinnCompiledObject compiled_obj; - compiled_obj.Compile(empty_graph, &empty_feed); - auto fetch = compiled_obj.Run(&empty_scope, &empty_feed); -} - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc new file mode 100644 index 0000000000000..44cea60bdcb8e --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" + +#include +#include +#include + +#include "cinn/common/target.h" +#include "cinn/common/type.h" +#include "cinn/frontend/decomposer/use_decomposer.h" +#include "cinn/frontend/net_builder.h" // need to remove after +#include "cinn/frontend/pass/use_program_pass.h" +#include "cinn/frontend/program_pass.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/pass/use_pass.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ::cinn::common::Target; +using ::cinn::common::Float; +using ::cinn::hlir::framework::GraphCompiler; +using ::cinn::hlir::framework::BuildScope; +using ::cinn::frontend::ProgramPass; +using ::cinn::hlir::framework::ApplyPass; + +CinnCompiler* CinnCompiler::GetInstance() { + static CinnCompiler instance; + return &instance; +} + +std::string CinnCompiler::AddGraph(std::unique_ptr graph) { + std::string graph_key; + ProgramDesc program; + GraphToProgram(*graph, &program); + program.Proto()->SerializeToString(&graph_key); + if (!graphs_.count(graph_key)) { + graphs_[graph_key] = std::move(graph); + } else { + LOG(WARNING) + << "The graph being added is already in CinnCompiler. Its key is:\n" + << graph_key; + } + return graph_key; +} + +const Graph& CinnCompiler::FindGraph(const std::string& graph_key) const { + PADDLE_ENFORCE_NE( + graphs_.count(graph_key), 0, + platform::errors::InvalidArgument("Can not find the target graph: %s", + graph_key.c_str())); + return *graphs_.at(graph_key); +} + +const CinnCompiledObject& CinnCompiler::Compile( + const Graph& graph, + const std::map& input_tensors, + const Target& target) { + CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + if (!cache_.count(cur_key)) { + real_compiled_num_++; + cache_[cur_key] = CompileGraph(graph, input_tensors, target); + } + return *cache_[cur_key]; +} + +const CinnCompiledObject& CinnCompiler::Compile( + const std::string& compilation_key, + const std::map& input_tensors, + const Target& target) { + const auto& graph = FindGraph(compilation_key); + return Compile(graph, input_tensors, target); +} + +std::unique_ptr CinnCompiler::CompileGraph( + const ir::Graph& graph, + const std::map& input_tensors, + const Target& target) const { + CinnGraphSymbolization symbol{real_compiled_num_, graph, target, + input_tensors}; + auto frontend_program = symbol(); + ProgramPass::Apply(&frontend_program, target, {"Decomposer"}); + auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( + frontend_program, target); + VLOG(4) << "The " << real_compiled_num_ << "-th compilation (" + << target.arch_str() << "), and its related graph:\n" + << cinn_graph->Visualize(); + ApplyPass(cinn_graph.get(), "OpFusion"); + auto scope = BuildScope(target, cinn_graph); + GraphCompiler graph_compiler(target, scope, cinn_graph); + GraphCompiler::CompileOptions options; + options.with_instantiate_variables = false; + auto compiled_res = graph_compiler.Build(options); + auto compiled_obj = std::make_unique(); + *compiled_obj 
= {std::move(compiled_res.runtime_program), scope,
+                   symbol.var_model_to_program_map()};
+  return compiled_obj;
+}
+
+}  // namespace paddle2cinn
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
new file mode 100644
index 0000000000000..3b0fb5cf6965f
--- /dev/null
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/common/target.h"
+#include "cinn/hlir/framework/graph_compiler.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace paddle2cinn {
+
+struct CinnCompiledObject {
+  std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
+  std::shared_ptr<::cinn::hlir::framework::Scope> scope;
+  std::unordered_map paddle2cinn_varmap;
+};
+
+// Entrance to use CINN.
+//
+// CINN cannot handle changeable shapes yet, so CinnCompiler keeps a cache
+// mapping from CinnCacheKey to CinnCompiledObject. On a cache hit we reuse
+// the cached CinnCompiledObject; otherwise we compile again and put the
+// result into the cache.
+class CinnCompiler {
+ public:
+  // Singleton
+  static CinnCompiler* GetInstance();
+
+  const CinnCompiledObject& Compile(
+      const ir::Graph& graph,
+      const std::map& input_tensors,
+      const ::cinn::common::Target& target);
+
+  const CinnCompiledObject& Compile(
+      const std::string& compilation_key,
+      const std::map& input_tensors,
+      const ::cinn::common::Target& target);
+
+  std::string AddGraph(std::unique_ptr graph);
+
+  const ir::Graph& FindGraph(const std::string& key) const;
+
+  std::int64_t real_compiled_num() const { return real_compiled_num_; }
+
+  ~CinnCompiler() = default;
+
+ private:
+  CinnCompiler() = default;
+  std::unique_ptr CompileGraph(
+      const ir::Graph& graph,
+      const std::map& input_tensors,
+      const ::cinn::common::Target& target) const;
+
+  std::unordered_map> graphs_;
+  std::unordered_map,
+                     CinnCacheKey::Hash>
+      cache_;
+  std::atomic_int64_t real_compiled_num_{0};
+
+  DISABLE_COPY_AND_ASSIGN(CinnCompiler);
+};
+
+}  // namespace paddle2cinn
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
new file mode 100644
index 0000000000000..22792e0f8c359
--- /dev/null
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -0,0 +1,168 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" + +#include +#include +#include + +#include "cinn/common/target.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ::cinn::common::Target; + +// X - +// | -> mul -> MUL_OUT - +// Y - | -> elementwise_add -> ADD_OUT -> relu -> RELU_OUT +// Z - +std::unique_ptr CreateGraph() { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + // mul + auto* x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::VarType::FP32); + y->SetShape({784, 100}); + y->SetPersistable(true); + y->SetIsParameter(true); + + auto* mul_op = global_block->AppendOp(); + mul_op->SetType("mul"); + mul_op->SetInput("X", {x->Name()}); + mul_op->SetInput("Y", {y->Name()}); + + auto* mul_out = global_block->Var("MUL_OUT"); + mul_out->SetType(proto::VarType::LOD_TENSOR); + mul_op->SetOutput("Out", {mul_out->Name()}); + + // add + auto* z = global_block->Var("Z"); + z->SetType(proto::VarType::LOD_TENSOR); + z->SetLoDLevel(0); + z->SetDataType(proto::VarType::FP32); + z->SetShape({100}); + z->SetPersistable(true); + z->SetIsParameter(true); + + auto* add_op = global_block->AppendOp(); + add_op->SetType("elementwise_add"); + add_op->SetInput("X", {mul_out->Name()}); + add_op->SetInput("Y", {z->Name()}); + + auto* add_out = global_block->Var("ADD_OUT"); + add_out->SetType(proto::VarType::LOD_TENSOR); + add_op->SetOutput("Out", {add_out->Name()}); + + // relu + auto* relu_op = global_block->AppendOp(); + relu_op->SetType("relu"); + relu_op->SetInput("X", {add_out->Name()}); + + auto* relu_out = global_block->Var("RELU_OUT"); + relu_out->SetType(proto::VarType::LOD_TENSOR); + relu_op->SetOutput("Out", {relu_out->Name()}); + program.Flush(); + return std::make_unique(program); +} + +TEST(CinnCompilerTest, Compile) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + auto cinn_pass = ir::PassRegistry::Instance().Get("build_cinn_pass"); + auto viz_graph = [&viz_pass](const std::string& viz_path, Graph* graph) { + viz_pass->Erase("graph_viz_path"); + viz_pass->Set("graph_viz_path", new std::string(viz_path)); + viz_pass->Apply(graph); + }; + + // create a graph + auto graph = CreateGraph(); + viz_graph("origin_graph.dot", graph.get()); + // apply build_cinn_pass + cinn_pass->Apply(graph.get()); + viz_graph("processed_graph.dot", graph.get()); + // get the compilation_key + std::vector compilation_keys; + for (auto& node : graph->Nodes()) { + if 
(node->IsOp() && node->Name() == kCinnLaunchOp) { + compilation_keys.emplace_back( + BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey))); + } + } + ASSERT_EQ(compilation_keys.size(), 1); + + const auto& compilation_key = compilation_keys[0]; + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& compiling_graph = cinn_compiler->FindGraph(compilation_key); + // viz_graph("compiling_graph.dot", const_cast(&compiling_graph)); + + EXPECT_THROW(cinn_compiler->FindGraph("no_existed"), + paddle::platform::EnforceNotMet); + + LoDTensor tensor1, tensor2, tensor3; + tensor1.Resize({1000, 784}); + tensor2.Resize({784, 100}); + tensor3.Resize({100}); + tensor1.mutable_data(platform::CPUPlace()); + tensor2.mutable_data(platform::CPUPlace()); + tensor3.mutable_data(platform::CPUPlace()); + std::map input_tensors = { + {"X", &tensor1}, {"Y", &tensor2}, {"Z", &tensor3}}; + + auto compile_fn = [&](const Target& target) { + const auto& compiled_obj = + cinn_compiler->Compile(compiling_graph, input_tensors, target); + ASSERT_NE(compiled_obj.runtime_program, nullptr); + ASSERT_NE(compiled_obj.scope, nullptr); + ASSERT_FALSE(compiled_obj.paddle2cinn_varmap.empty()); + const auto& cached_obj = + cinn_compiler->Compile(compilation_key, input_tensors, target); + ASSERT_EQ(reinterpret_cast(&compiled_obj), + reinterpret_cast(&cached_obj)); + }; + + // GPU Compilation + compile_fn(::cinn::common::DefaultNVGPUTarget()); + ASSERT_EQ(cinn_compiler->real_compiled_num(), 1); + // CPU Compilation + compile_fn(::cinn::common::DefaultHostTarget()); + ASSERT_EQ(cinn_compiler->real_compiled_num(), 2); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +USE_PASS(build_cinn_pass); +USE_PASS(graph_viz_pass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc new file mode 100644 index 0000000000000..e4e16498b8440 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -0,0 +1,172 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/paddle2cinn/transform_desc.h"
+#include "paddle/fluid/framework/variable.h"
+
+#include "cinn/frontend/op_mappers/use_op_mappers.h"
+#include "cinn/frontend/var_type_utils.h"
+
+namespace paddle {
+namespace framework {
+namespace paddle2cinn {
+
+using ir::Graph;
+using ir::Node;
+using CinnTensor = ::cinn::hlir::framework::Tensor;
+using OpMapperContext = CinnGraphSymbolization::OpMapperContext;
+using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc;
+using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap;
+
+namespace utils {
+
+OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor(const Tensor& tensor) {
+  OpMapperContext::FeedInfo info;
+  const auto& dim = tensor.dims();
+  for (int i = 0; i < dim.size(); i++) {
+    info.shape.emplace_back(static_cast(dim[i]));
+  }
+
+  auto cinn_var_type = TransformVarDataTypeToCinn(tensor.type());
+  info.type = ::cinn::frontend::utils::CppVarType2CommonType(cinn_var_type);
+  return info;
+}
+}  // namespace utils
+
+FeedInfoMap CinnGraphSymbolization::GetFeedInfoMapFromInput() const {
+  FeedInfoMap feed_map;
+  for (auto& feed_pair : input_tensors_) {
+    const auto& feed_name = feed_pair.first;
+    const auto* tensor = feed_pair.second;
+
+    feed_map[feed_name] = utils::GetCinnFeedInfoFromTensor(*tensor);
+  }
+  return feed_map;
+}
+
+// get the set of Parameter var names that feed the graph's ops
+std::unordered_set
+CinnGraphSymbolization::GetGraphInputParameterNames() const {
+  std::unordered_set names;
+
+  for (auto* node : graph_.Nodes()) {
+    if (node->IsOp()) {
+      for (auto* var : node->inputs) {
+        if (var->Var()->IsParameter()) {
+          // We only need to preserve the input parameter vars of the
+          // graph; all others are ignored.
+          names.insert(var->Name());
+        }
+      }
+    }
+  }
+
+  return names;
+}
+
+// Transform the paddle scope to a cinn scope; note that we only preserve the
+// graph's input parameter variables and ignore others.
+std::shared_ptr<::cinn::hlir::framework::Scope>
+CinnGraphSymbolization::CreateCinnScope(const FeedInfoMap& feed_map) const {
+  auto cinn_scope = ::cinn::hlir::framework::Scope::Create();
+
+  // get the graph's input parameter variable name list
+  auto parameter_names = GetGraphInputParameterNames();
+
+  for (const auto& param_name : parameter_names) {
+    VLOG(4) << "add param var [" << param_name << "] into scope";
+    // If a var cannot be found in the graph inputs, skip it.
+    // The scope accepts CINN-format names, so here we need to transform
+    // the paddle-format name into the CINN format.
+    auto* cinn_var = cinn_scope->Var(
+        ::cinn::utils::TransValidVarName(param_name));
+
+    auto& cinn_tensor = absl::get(*cinn_var);
+    // here we only need to preserve the dtype and shape, not the data
+    auto feed_info = feed_map.at(param_name);
+    cinn_tensor->set_type(feed_info.type);
+    cinn_tensor->Resize(::cinn::hlir::framework::Shape(feed_info.shape));
+  }
+
+  return cinn_scope;
+}
+
+std::vector>
+CinnGraphSymbolization::TransformAllGraphOpToCinn() const {
+  std::vector> cinn_op_descs;
+
+  const auto& sorted_ops = ir::TopologySortOperations(graph_);
+  for (auto* node : sorted_ops) {
+    cinn_op_descs.emplace_back(std::make_unique());
+    auto& cinn_desc = cinn_op_descs.back();
+
+    TransformOpDescToCinn(node->Op(), cinn_desc.get());
+  }
+  return cinn_op_descs;
+}
+
+void CinnGraphSymbolization::RunOp(const CinnOpDesc& op_desc,
+                                   const OpMapperContext& ctx) const {
+  const auto& op_type = op_desc.Type();
+  auto* kernel = ::cinn::frontend::OpMapperRegistry::Global()->Find(op_type);
+  PADDLE_ENFORCE_NE(kernel, nullptr,
+                    platform::errors::NotFound(
+                        "Op %s is not supported by CINN, please register"
+                        " this op in the CINN repo.",
+                        op_type.c_str()));
+  VLOG(4) << "Running Op " << op_type;
+  kernel->Run(op_desc, ctx);
+}
+
+void CinnGraphSymbolization::RunGraph(const OpMapperContext& ctx) const {
+  auto cinn_op_descs = TransformAllGraphOpToCinn();
+  // run the CINN ops one by one; note that the ops have already been
+  // topologically sorted in TransformAllGraphOpToCinn.
+  for (auto& op_desc : cinn_op_descs) {
+    RunOp(*op_desc, ctx);
+  }
+}
+
+::cinn::frontend::Program CinnGraphSymbolization::operator()() {
+  std::string builder_name = "NetBuilder_of_graph_" + std::to_string(graph_id_);
+  VLOG(4) << "NetBuilder Name " << builder_name;
+
+  ::cinn::frontend::NetBuilder builder(builder_name);
+
+  auto feed_map = GetFeedInfoMapFromInput();
+  auto cinn_scope = CreateCinnScope(feed_map);
+
+  OpMapperContext ctx(*cinn_scope, target_, &builder, &var_map_,
+                      &var_model_to_program_map_);
+  // add every tensor's feed info into the context
+  for (auto& feed_pair : feed_map) {
+    ctx.AddFeedInfo(feed_pair.first, feed_pair.second);
+    VLOG(4) << "add feed var [" << feed_pair.first << "] into context";
+  }
+  RunGraph(ctx);
+
+  return builder.Build();
+}
+
+}  // namespace paddle2cinn
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
new file mode 100644
index 0000000000000..b6b4b24c6ee3d
--- /dev/null
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
@@ -0,0 +1,128 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+#include "cinn/frontend/net_builder.h"
+#include "cinn/frontend/op_mapper_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace paddle2cinn {
+
+// An executor that accepts a subgraph generated by BuildCinnPass, runs each
+// op's CINN op mapper, and finally returns a frontend::Program object
+// corresponding to the subgraph.
+//
+// Parameters:
+// 1. graph_id:
+//    the unique graph id, used for generating a unique NetBuilder name.
+// 2. graph:
+//    the CINN subgraph; all of its ops are supported by CINN, and the
+//    graph is independent of other graphs.
+// 3. input_tensors:
+//    all input var nodes of the CINN subgraph. They are necessary because
+//    we need to pass the shapes and data types into CINN; otherwise the
+//    NetBuilder may fail when a shape does not meet its preconditions.
+//
+// Description:
+// The main entry is operator(); it runs every op through the CINN
+// OpMapper and finally returns a program object.
+// operator() consists of the following steps:
+// 1. create a NetBuilder whose name is unique for each graph;
+// 2. create an OpMapperContext containing the scope, target, local var_map
+//    and local var_model_to_program_map;
+// 3. add all feed vars into the OpMapperContext to pass their shapes and
+//    types into CINN;
+// 4. topologically sort the graph op nodes;
+// 5. transform all ops from the paddle opdesc format to the cinn opdesc
+//    format;
+// 6. run the CINN ops in the graph one by one, in the sorted order;
+// 7. return NetBuilder.Build() after all ops have run.
class CinnGraphSymbolization {
+ public:
+  CinnGraphSymbolization(
+      int64_t graph_id, const ir::Graph& graph,
+      const ::cinn::common::Target& target,
+      const std::map& input_tensors)
+      : graph_id_(graph_id),
+        graph_(graph),
+        target_(target),
+        input_tensors_(input_tensors) {}
+
+  // run all CINN ops in the graph in topological order, then return the
+  // built frontend program
+  ::cinn::frontend::Program operator()();
+
+  // return the internal variable map
+  const std::unordered_map& var_map()
+      const {
+    return var_map_;
+  }
+
+  // return the map from variable names in the paddle model to the cinn
+  // program.
+  const std::unordered_map& var_model_to_program_map()
+      const {
+    return var_model_to_program_map_;
+  }
+
+  using OpMapperContext = ::cinn::frontend::OpMapperContext;
+  using FeedInfoMap =
+      std::unordered_map;
+  using CinnOpDesc = ::cinn::frontend::paddle::cpp::OpDesc;
+
+ private:
+  const int64_t graph_id_;
+  const ir::Graph& graph_;
+  const ::cinn::common::Target& target_;
+  const std::map& input_tensors_;
+
+  // local variable maps preserved during symbolization
+  std::unordered_map var_map_;
+  std::unordered_map var_model_to_program_map_;
+
+  // transform all paddle var descs in the feed list into feed infos
+  FeedInfoMap GetFeedInfoMapFromInput() const;
+
+  // transform all paddle op descs in the graph into cinn op descs
+  std::vector> TransformAllGraphOpToCinn() const;
+
+  // RunOp accepts an OpDesc and the global run context, then runs
+  // its kernel registered in the OpMapper.
+  // Called in RunGraph.
+  void RunOp(const CinnOpDesc& op_desc, const OpMapperContext& ctx) const;
+
+  // run the transformed ops one by one.
+ void RunGraph(const OpMapperContext& ctx) const; + + // create cinn scope and add parameter's feed info into scope + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope( + const FeedInfoMap& feed_map) const; + + // get the graph op's input persistable var name set + std::unordered_set GetGraphInputParameterNames() const; + + friend class CinnGraphSymbolizationForTest; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc new file mode 100644 index 0000000000000..940228314a1d4 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -0,0 +1,299 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ir::Node; +using ::cinn::frontend::NetBuilder; +using CinnTensor = ::cinn::hlir::framework::Tensor; +using OpMapperContext = CinnGraphSymbolization::OpMapperContext; +using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; +using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; + +// only used for test CinnGraphSymbolization class +class CinnGraphSymbolizationForTest { + public: + explicit CinnGraphSymbolizationForTest(CinnGraphSymbolization* cinn_symbol) + : cinn_symbol_(cinn_symbol) {} + + std::unordered_set GetGraphInputParameterNames() { + return cinn_symbol_->GetGraphInputParameterNames(); + } + + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope( + const FeedInfoMap& feed_map) { + return cinn_symbol_->CreateCinnScope(feed_map); + } + + OpMapperContext CreateNewContext(NetBuilder* builder, + const FeedInfoMap& feed_map) { + return OpMapperContext(*cinn_symbol_->CreateCinnScope(feed_map), + cinn_symbol_->target_, builder, + &cinn_symbol_->var_map_, + &cinn_symbol_->var_model_to_program_map_); + } + + FeedInfoMap GetFeedInfoMapFromInput() { + return cinn_symbol_->GetFeedInfoMapFromInput(); + } + + std::vector> TransformAllGraphOpToCinn() { + return cinn_symbol_->TransformAllGraphOpToCinn(); + } + + void RunOp(const CinnOpDesc& op_desc, const OpMapperContext& ctx) { + cinn_symbol_->RunOp(op_desc, ctx); + } + + private: + CinnGraphSymbolization* cinn_symbol_; +}; + +class CinnGraphSymbolizationTest : public ::testing::Test { + public: + CinnGraphSymbolizationTest() { + int64_t graph_id = 100; + graph_ = BuildAllOpSupportCinnGraph(); + target_ = CreateDefaultTarget(); + feed_tensors_ = CreateFeedTensors(); + feed_targets_ = ConvertFeedType(feed_tensors_); + symbol_ = std::make_unique(graph_id, *graph_, + target_, feed_targets_); + builder_ = std::make_unique("NetBuilder_of_graph_" + + std::to_string(graph_id)); + test_ = std::make_unique(symbol_.get()); + feed_map_ = test_->GetFeedInfoMapFromInput(); + } + + 
std::unique_ptr symbol_; + std::unique_ptr test_; + std::map feed_targets_; + + OpMapperContext CreateNewContext() { + return test_->CreateNewContext(builder_.get(), feed_map_); + } + + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope() { + return test_->CreateCinnScope(feed_map_); + } + + private: + std::unique_ptr graph_; + ::cinn::common::Target target_; + std::map feed_tensors_; + std::unique_ptr builder_; + FeedInfoMap feed_map_; + + std::unique_ptr BuildAllOpSupportCinnGraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // v4 -- + + OpDesc add_op; + add_op.SetType("add"); + add_op.SetInput("X", {"var3"}); + add_op.SetInput("Y", {"var4"}); + add_op.SetOutput("Out", {"var5"}); + + OpDesc mul_op; + mul_op.SetType("mul"); + mul_op.SetInput("X", {"var1"}); + mul_op.SetInput("Y", {"var2"}); + mul_op.SetOutput("Out", {"var3"}); + + OpDesc relu_op; + relu_op.SetType("relu"); + relu_op.SetInput("X", {"var5"}); + relu_op.SetOutput("Out", {"var6"}); + + OpDesc feed_var1; + feed_var1.SetType("feed"); + feed_var1.SetOutput("Out", {"var1"}); + + OpDesc feed_var4; + feed_var4.SetType("feed"); + feed_var4.SetOutput("Out", {"var4"}); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + VarDesc var6("var6"); + + ir::Node* add = g->CreateOpNode(&add_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + + ir::Node* feed1 = g->CreateOpNode(&feed_var1); + ir::Node* feed4 = g->CreateOpNode(&feed_var4); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + ir::Node* v6 = g->CreateVarNode(&var6); + + // fill op node + feed1->outputs = {v1}; + feed4->outputs = {v4}; + mul->inputs = {v1, v2}; + mul->outputs = {v3}; + add->inputs = {v3, v4}; + add->outputs = {v5}; + relu->inputs = {v5}; + relu->outputs = {v6}; + + // fill variable node + v1->inputs = {feed1}; + v1->outputs = {mul}; + + v2->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {add}; + + v4->inputs = {feed4}; + v4->outputs = {add}; + + v5->inputs = {add}; + v5->outputs = {relu}; + + v6->inputs = {relu}; + + return g; + } + + ::cinn::common::Target CreateDefaultTarget(bool use_gpu = false) { +#ifdef PADDLE_WITH_CUDA + if (use_gpu) { + return ::cinn::common::DefaultNVGPUTarget(); + } +#endif + return ::cinn::common::DefaultHostTarget(); + } + + std::map CreateFeedTensors() { + std::map feed_targets; + + auto create_tensor = []() { + LoDTensor tensor; + DDim dims = {256, 1024}; + tensor.Resize(dims); + tensor.mutable_data(platform::CPUPlace(), proto::VarType::FP32); + return tensor; + }; +#define FillFeedList(Name) feed_targets[#Name] = create_tensor(); + FillFeedList(var1); + FillFeedList(var2); + FillFeedList(var3); + FillFeedList(var4); + FillFeedList(var5); + FillFeedList(var6); +#undef FillFeedList + DDim y_dim = {1024, 1024}; + feed_targets["var2"].Resize(y_dim); + + return feed_targets; + } + + std::map ConvertFeedType( + const std::map& feed_targets) { + std::map res; + for (auto& feed_pair : feed_targets) { + res[feed_pair.first] = &feed_pair.second; + } + return res; + } +}; + +TEST_F(CinnGraphSymbolizationTest, feed_map) { + auto feed_map = test_->GetFeedInfoMapFromInput(); + auto ctx = 
CreateNewContext(); + + ASSERT_TRUE(feed_map.count("var1")); + ASSERT_TRUE(feed_map.count("var2")); + + auto feed_info = feed_map.at("var1"); + ASSERT_EQ(feed_info.shape, std::vector({256, 1024})); + ASSERT_EQ(feed_info.type, ::cinn::common::F32()); +} + +TEST_F(CinnGraphSymbolizationTest, scope) { + auto prame_names = test_->GetGraphInputParameterNames(); + ASSERT_EQ(prame_names, std::unordered_set({"var2"})); + + auto cinn_scope = CreateCinnScope(); + + auto* var1 = cinn_scope->FindVar("var1"); + ASSERT_EQ(var1, nullptr); + auto* var2 = cinn_scope->FindVar("var2"); + ASSERT_NE(var2, nullptr); + + auto& cinn_tensor = absl::get(*var2); + ASSERT_EQ(cinn_tensor->shape().data(), std::vector({1024, 1024})); + ASSERT_EQ(cinn_tensor->type(), ::cinn::common::F32()); +} + +TEST_F(CinnGraphSymbolizationTest, sortgraph) { + auto cinn_op_descs = test_->TransformAllGraphOpToCinn(); + ASSERT_FALSE(cinn_op_descs.empty()); + std::vector sort_names; + for (auto& desc : cinn_op_descs) { + sort_names.emplace_back(desc->Type()); + } + ASSERT_EQ(sort_names, + std::vector({"feed", "mul", "feed", "add", "relu"})); +} + +TEST_F(CinnGraphSymbolizationTest, runop) { + auto cinn_op_descs = test_->TransformAllGraphOpToCinn(); + auto feed_map = test_->GetFeedInfoMapFromInput(); + + auto ctx = CreateNewContext(); + // add all tensor's feed info into context + for (auto& feed_pair : feed_map) { + ctx.AddFeedInfo(feed_pair.first, feed_pair.second); + } + + ASSERT_NO_THROW(test_->RunOp(*cinn_op_descs[0], ctx)); + + CinnOpDesc desc; + desc.SetType("fake"); + ASSERT_ANY_THROW(test_->RunOp(desc, ctx)); +} + +TEST_F(CinnGraphSymbolizationTest, basic) { + ASSERT_NO_THROW((*symbol_)()); + ASSERT_FALSE(symbol_->var_map().empty()); + ASSERT_FALSE(symbol_->var_model_to_program_map().empty()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc deleted file mode 100644 index ba90095cae679..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
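Worth noting before the deleted runner: CinnRunner (below) implemented its singleton with a static std::shared_ptr guarded by std::once_flag, while the CinnCompiler that replaces it (above) uses a function-local static, which C++11 already initializes exactly once and thread-safely. A side-by-side sketch of the two idioms, with Widget as a stand-in class name:

#include <memory>
#include <mutex>

class Widget {
 public:
  // Old style: explicit once_flag plus shared_ptr storage.
  static std::shared_ptr<Widget> GetInstanceCallOnce() {
    static std::once_flag flag;
    static std::shared_ptr<Widget> instance;
    std::call_once(flag, []() { instance.reset(new Widget()); });
    return instance;
  }

  // New style (Meyers singleton): the local static is constructed once,
  // thread-safely, on first use.
  static Widget* GetInstance() {
    static Widget instance;
    return &instance;
  }

 private:
  Widget() = default;
};

The Meyers form is shorter and sidesteps shared-ownership questions, which is why the replacement class can simply hand out a raw pointer.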
- -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" - -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -using ir::Graph; - -std::once_flag CinnRunner::get_instance_once_flag_; -std::shared_ptr CinnRunner::instance_; - -std::shared_ptr CinnRunner::GetInstance() { - std::call_once(get_instance_once_flag_, - [&]() { instance_.reset(new CinnRunner()); }); - return instance_; -} - -void CinnRunner::ReplaceWithCinn(Graph* graph) { - // TODO(zhhsplendid): call CINN Api when it is ready -} - -std::map CinnRunner::Run( - const Graph& graph, Scope* scope, - std::map* feed_targets) { - CinnCacheKey cur_key(graph, *feed_targets); - std::shared_ptr obj_to_run; - if (cache_.find(cur_key) != cache_.end()) { - obj_to_run = cache_[cur_key]; - } else { - obj_to_run = std::make_shared(); - obj_to_run->Compile(graph, feed_targets); - cache_[cur_key] = obj_to_run; - } - return obj_to_run->Run(scope, feed_targets); -} - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.h b/paddle/fluid/framework/paddle2cinn/cinn_runner.h deleted file mode 100644 index 23d9565d2f392..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -// Entrance to run CINN. -// -// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping -// from CinnCacheKey to CinnCompiledObject. If cache hits, we will re-use cache -// stored CinnCompiledObject, otherwise we will compile again and put into -// cache. 
-class CinnRunner { - public: - ~CinnRunner() {} - - // Singleton - static std::shared_ptr GetInstance(); - - // Replace Paddle graph with some CINN subgraphs/ops - void ReplaceWithCinn(ir::Graph* graph); - - // Feed LoDTensors to tun CINN compiled object and return fetched result - std::map Run( - const ir::Graph& graph, Scope* scope, - std::map* feed_targets); - - private: - CinnRunner() {} - - static std::once_flag get_instance_once_flag_; - static std::shared_ptr instance_; - std::unordered_map, - CinnCacheKey::Hash> - cache_; -}; - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc deleted file mode 100644 index c02b994c147ca..0000000000000 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" - -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace paddle2cinn { - -using ir::Graph; - -TEST(CinnRunnerTest, TodoTest) { - ProgramDesc empty_program; - Graph empty_graph(empty_program); - Scope empty_scope; - std::map empty_feed; - - std::shared_ptr cinn_runner = CinnRunner::GetInstance(); - cinn_runner->ReplaceWithCinn(&empty_graph); - cinn_runner->Run(empty_graph, &empty_scope, &empty_feed); -} - -} // namespace paddle2cinn -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.cc b/paddle/fluid/framework/paddle2cinn/transform_desc.cc new file mode 100644 index 0000000000000..52b1395c732ac --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
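The transform_desc module added below is a pure translation layer: each ToCinn/FromCinn pair copies one descriptor level (var, op, block, program) between Paddle's protobuf-backed desc classes and CINN's plain-C++ cpp:: mirrors. A round-trip sketch at the op level, using only functions declared in this diff (the op fields are arbitrary example values):

#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/paddle2cinn/transform_desc.h"

namespace pf = paddle::framework;
namespace cpp = ::cinn::frontend::paddle::cpp;

void RoundTripOpDesc() {
  pf::OpDesc pb_op;
  pb_op.SetType("relu");
  pb_op.SetInput("X", {"x0"});
  pb_op.SetOutput("Out", {"out0"});

  // Paddle -> CINN: type, inputs, outputs and attrs are copied field by field.
  cpp::OpDesc cpp_op;
  pf::paddle2cinn::TransformOpDescToCinn(&pb_op, &cpp_op);

  // CINN -> Paddle: the target desc is cleared first, so the result holds
  // exactly what the cpp desc carried.
  pf::OpDesc pb_back;
  pf::paddle2cinn::TransformOpDescFromCinn(cpp_op, &pb_back);
}

The mutable pointer on the Paddle side even in the read direction is deliberate; the header comment further down explains that the Paddle desc classes lack the const APIs the transform would need.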
+
+#include "paddle/fluid/framework/paddle2cinn/transform_desc.h"
+
+#include
+#include
+#include
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace paddle2cinn {
+
+using PbVarType = framework::proto::VarType;
+namespace cpp = ::cinn::frontend::paddle::cpp;
+
+::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarTypeToCinn(
+    const ::paddle::framework::proto::VarType::Type &type) {
+#define SET_TYPE_CASE_ITEM(type__)                                   \
+  case ::paddle::framework::proto::VarType::type__:                  \
+    return ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__;  \
+    break;
+
+  switch (type) {
+    SET_TYPE_CASE_ITEM(LOD_TENSOR);
+    SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY);
+    SET_TYPE_CASE_ITEM(LOD_RANK_TABLE);
+    SET_TYPE_CASE_ITEM(SELECTED_ROWS);
+    SET_TYPE_CASE_ITEM(FEED_MINIBATCH);
+    SET_TYPE_CASE_ITEM(FETCH_LIST);
+    SET_TYPE_CASE_ITEM(STEP_SCOPES);
+    SET_TYPE_CASE_ITEM(PLACE_LIST);
+    SET_TYPE_CASE_ITEM(READER);
+    default:
+      PADDLE_THROW(platform::errors::NotFound("Cannot find var type"));
+  }
+#undef SET_TYPE_CASE_ITEM
+}
+
+::paddle::framework::proto::VarType::Type TransformVarTypeFromCinn(
+    const ::cinn::frontend::paddle::cpp::VarDescAPI::Type &type) {
+#define SET_TYPE_CASE_ITEM(type__)                                   \
+  case ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__:      \
+    return ::paddle::framework::proto::VarType::type__;              \
+    break;
+
+  switch (type) {
+    SET_TYPE_CASE_ITEM(LOD_TENSOR);
+    SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY);
+    SET_TYPE_CASE_ITEM(LOD_RANK_TABLE);
+    SET_TYPE_CASE_ITEM(SELECTED_ROWS);
+    SET_TYPE_CASE_ITEM(FEED_MINIBATCH);
+    SET_TYPE_CASE_ITEM(FETCH_LIST);
+    SET_TYPE_CASE_ITEM(STEP_SCOPES);
+    SET_TYPE_CASE_ITEM(PLACE_LIST);
+    SET_TYPE_CASE_ITEM(READER);
+    default:
+      PADDLE_THROW(platform::errors::NotFound("Cannot find var type"));
+  }
+#undef SET_TYPE_CASE_ITEM
+}
+
+::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarDataTypeToCinn(
+    const ::paddle::framework::proto::VarType::Type &type) {
+#define SET_DATA_TYPE_CASE_ITEM(type__)                              \
+  case ::paddle::framework::proto::VarType::type__:                  \
+    return ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__;  \
+    break;
+
+  switch (type) {
+    SET_DATA_TYPE_CASE_ITEM(BOOL);
+    SET_DATA_TYPE_CASE_ITEM(SIZE_T);
+    SET_DATA_TYPE_CASE_ITEM(UINT8);
+    SET_DATA_TYPE_CASE_ITEM(INT8);
+    SET_DATA_TYPE_CASE_ITEM(INT16);
+    SET_DATA_TYPE_CASE_ITEM(INT32);
+    SET_DATA_TYPE_CASE_ITEM(INT64);
+    SET_DATA_TYPE_CASE_ITEM(FP16);
+    SET_DATA_TYPE_CASE_ITEM(FP32);
+    SET_DATA_TYPE_CASE_ITEM(FP64);
+    default:
+      PADDLE_THROW(platform::errors::NotFound("Cannot find var data type"));
+  }
+#undef SET_DATA_TYPE_CASE_ITEM
+}
+
+::paddle::framework::proto::VarType::Type TransformVarDataTypeFromCpp(
+    const ::cinn::frontend::paddle::cpp::VarDescAPI::Type &type) {
+#define SET_DATA_TYPE_CASE_ITEM(type__)                              \
+  case ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__:      \
+    return ::paddle::framework::proto::VarType::type__;              \
+    break;
+
+  switch (type) {
+    SET_DATA_TYPE_CASE_ITEM(BOOL);
+    SET_DATA_TYPE_CASE_ITEM(SIZE_T);
+    SET_DATA_TYPE_CASE_ITEM(UINT8);
+    SET_DATA_TYPE_CASE_ITEM(INT8);
+    SET_DATA_TYPE_CASE_ITEM(INT16);
+    SET_DATA_TYPE_CASE_ITEM(INT32);
+    SET_DATA_TYPE_CASE_ITEM(INT64);
+    SET_DATA_TYPE_CASE_ITEM(FP16);
+    SET_DATA_TYPE_CASE_ITEM(FP32);
+    SET_DATA_TYPE_CASE_ITEM(FP64);
+    default:
+      PADDLE_THROW(platform::errors::NotFound("Cannot find var data type"));
+  }
+#undef SET_DATA_TYPE_CASE_ITEM
+}
+
+void TransformVarDescToCinn(framework::VarDesc *pb_desc,
+                            cpp::VarDesc *cpp_desc) {
+  cpp_desc->SetName(pb_desc->Name());
+
cpp_desc->SetType(TransformVarTypeToCinn(pb_desc->GetType())); + cpp_desc->SetPersistable(pb_desc->Persistable()); + if (pb_desc->Name() != "feed" && pb_desc->Name() != "fetch") { + cpp_desc->SetDataType(TransformVarDataTypeToCinn(pb_desc->GetDataType())); + cpp_desc->SetShape(pb_desc->GetShape()); + } +} + +void TransformVarDescFromCinn(const cpp::VarDesc &cpp_desc, + framework::VarDesc *pb_desc) { + pb_desc->Proto()->Clear(); + pb_desc->SetName(cpp_desc.Name()); + pb_desc->SetType(TransformVarTypeFromCinn(cpp_desc.GetType())); + pb_desc->SetPersistable(cpp_desc.Persistable()); + if (cpp_desc.Name() != "feed" && cpp_desc.Name() != "fetch") { + pb_desc->SetShape(cpp_desc.GetShape()); + pb_desc->SetDataType(TransformVarDataTypeFromCpp(cpp_desc.GetDataType())); + } +} + +/// For OpDesc transform +void OpInputsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + for (const std::string ¶m : pb_desc->InputNames()) { + cpp_desc->SetInput(param, pb_desc->Input(param)); + } +} + +void OpInputsFromCinn(const cpp::OpDesc &cpp_desc, framework::OpDesc *pb_desc) { + pb_desc->MutableInputs()->clear(); + for (const std::string ¶m : cpp_desc.InputArgumentNames()) { + pb_desc->SetInput(param, cpp_desc.Input(param)); + } +} + +void OpOutputsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + for (const std::string ¶m : pb_desc->OutputNames()) { + cpp_desc->SetOutput(param, pb_desc->Output(param)); + } +} + +void OpOutputsFromCinn(const cpp::OpDesc &cpp_desc, + framework::OpDesc *pb_desc) { + pb_desc->MutableOutputs()->clear(); + for (const std::string ¶m : cpp_desc.OutputArgumentNames()) { + pb_desc->SetOutput(param, cpp_desc.Output(param)); + } +} + +void OpAttrsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + using AttrType = framework::proto::AttrType; + auto set_attr = [&](const std::string &name, AttrType type) { + switch (type) { +#define IMPL_ONE(type__, T) \ + case AttrType::type__: \ + cpp_desc->SetAttr(name, pb_desc->GetAttrIfExists(name)); \ + break; + IMPL_ONE(INT, int32_t); + IMPL_ONE(FLOAT, float); + IMPL_ONE(STRING, std::string); + IMPL_ONE(STRINGS, std::vector); + IMPL_ONE(FLOATS, std::vector); + IMPL_ONE(INTS, std::vector); + IMPL_ONE(BOOLEAN, bool); + IMPL_ONE(LONG, int64_t); + IMPL_ONE(LONGS, std::vector); + case AttrType::BLOCK: { + auto i = pb_desc->GetAttrIfExists(name); + cpp_desc->SetAttr(name, i); + break; + } + default: + PADDLE_THROW(platform::errors::NotFound( + "Unsupported attr type %d found ", static_cast(type))); + } + }; +#undef IMPL_ONE + + for (const auto &attr_name : pb_desc->AttrNames()) { + auto type = pb_desc->GetAttrType(attr_name); + set_attr(attr_name, type); + } +} + +void OpAttrsFromCinn(const cpp::OpDesc &cpp_desc, framework::OpDesc *pb_desc) { + pb_desc->MutableAttrMap()->clear(); + using AttrType = cpp::OpDescAPI::AttrType; + auto set_attr = [&](const std::string &name, AttrType type) { + switch (type) { +#define IMPL_ONE(type__, T) \ + case AttrType::type__: \ + pb_desc->SetAttr(name, cpp_desc.GetAttr(name)); \ + break; + IMPL_ONE(INT, int32_t); + IMPL_ONE(FLOAT, float); + IMPL_ONE(STRING, std::string); + IMPL_ONE(STRINGS, std::vector); + IMPL_ONE(FLOATS, std::vector); + IMPL_ONE(INTS, std::vector); + IMPL_ONE(BOOLEAN, bool); + IMPL_ONE(LONG, int64_t); + IMPL_ONE(LONGS, std::vector); + default: + PADDLE_THROW(platform::errors::NotFound( + "Unsupported attr type %d found ", static_cast(type))); + } + }; +#undef IMPL_ONE + + for (const auto &attr_name : cpp_desc.AttrNames()) { + auto type = cpp_desc.GetAttrType(attr_name); + 
set_attr(attr_name, type);
+  }
+}
+
+void TransformOpDescToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) {
+  cpp_desc->SetType(pb_desc->Type());
+  OpInputsToCinn(pb_desc, cpp_desc);
+  OpOutputsToCinn(pb_desc, cpp_desc);
+  OpAttrsToCinn(pb_desc, cpp_desc);
+}
+
+void TransformOpDescFromCinn(const cpp::OpDesc &cpp_desc,
+                             framework::OpDesc *pb_desc) {
+  pb_desc->Proto()->Clear();
+  pb_desc->SetType(cpp_desc.Type());
+  OpInputsFromCinn(cpp_desc, pb_desc);
+  OpOutputsFromCinn(cpp_desc, pb_desc);
+  OpAttrsFromCinn(cpp_desc, pb_desc);
+}
+
+/// For BlockDesc transform
+void TransformBlockDescToCinn(framework::BlockDesc *pb_desc,
+                              cpp::BlockDesc *cpp_desc) {
+  cpp_desc->SetIdx(pb_desc->ID());
+  cpp_desc->SetParentIdx(pb_desc->Parent());
+  cpp_desc->SetForwardBlockIdx(pb_desc->ForwardBlockID());
+
+  cpp_desc->ClearOps();
+  const auto &all_ops = pb_desc->AllOps();
+  for (const auto &op : all_ops) {
+    auto *cpp_op_desc = cpp_desc->AddOp();
+    TransformOpDescToCinn(op, cpp_op_desc);
+  }
+
+  cpp_desc->ClearVars();
+  const auto &all_vars = pb_desc->AllVars();
+  for (const auto &var : all_vars) {
+    auto *cpp_var_desc = cpp_desc->AddVar();
+    TransformVarDescToCinn(var, cpp_var_desc);
+  }
+}
+
+void TransformBlockDescFromCinn(const cpp::BlockDesc &cpp_desc,
+                                framework::BlockDesc *pb_desc) {
+  pb_desc->Proto()->Clear();
+
+  pb_desc->Proto()->set_idx(cpp_desc.Idx());
+  pb_desc->Proto()->set_parent_idx(cpp_desc.ParentIdx());
+  pb_desc->Proto()->set_forward_block_idx(cpp_desc.ForwardBlockIdx());
+
+  for (size_t i = 0; i < cpp_desc.OpsSize(); ++i) {
+    const auto &cpp_op_desc =
+        cpp_desc.template GetConstOp(static_cast(i));
+    auto *pb_op_desc = pb_desc->AppendOp();
+    TransformOpDescFromCinn(cpp_op_desc, pb_op_desc);
+  }
+
+  for (size_t i = 0; i < cpp_desc.VarsSize(); ++i) {
+    const auto &cpp_var_desc =
+        cpp_desc.template GetConstVar(static_cast(i));
+    auto *pb_var_desc = pb_desc->Var(cpp_var_desc.Name());
+    TransformVarDescFromCinn(cpp_var_desc, pb_var_desc);
+  }
+}
+
+/// For ProgramDesc transform
+void TransformProgramDescToCinn(framework::ProgramDesc *pb_desc,
+                                cpp::ProgramDesc *cpp_desc) {
+  if (pb_desc->Proto()->version().has_version()) {
+    cpp_desc->SetVersion(pb_desc->Version());
+  }
+
+  cpp_desc->ClearBlocks();
+  for (size_t i = 0; i < pb_desc->Size(); ++i) {
+    auto *pb_block_desc = pb_desc->MutableBlock(i);
+    auto *cpp_block_desc = cpp_desc->AddBlock();
+    TransformBlockDescToCinn(pb_block_desc, cpp_block_desc);
+  }
+}
+
+void TransformProgramDescFromCinn(const cpp::ProgramDesc &cpp_desc,
+                                  framework::ProgramDesc *pb_desc) {
+  pb_desc->Proto()->Clear();
+
+  if (cpp_desc.HasVersion()) {
+    pb_desc->SetVersion(cpp_desc.Version());
+  }
+
+  // For a paddle proto program, the only way to add a block is to invoke
+  // AppendBlock(), and AppendBlock() requires one parameter:
+  // const BlockDesc &parent. The only effect of parent is to set the new
+  // block's parent_idx value. Since a program always has at least one
+  // block, we initially use block0 as every sub-block's parent; the real
+  // parent_idx is set afterwards in "TransformBlockDescFromCinn".
+ auto *block0 = pb_desc->MutableBlock(0); + + for (size_t i = 0; i < cpp_desc.BlocksSize(); ++i) { + const auto &cpp_block_desc = cpp_desc.GetConstBlock(i); + framework::BlockDesc *pb_block_desc = nullptr; + if (i < pb_desc->Size()) { + pb_block_desc = pb_desc->MutableBlock(i); + } else { + pb_block_desc = pb_desc->AppendBlock(*block0); + } + TransformBlockDescFromCinn(cpp_block_desc, pb_block_desc); + } +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.h b/paddle/fluid/framework/paddle2cinn/transform_desc.h new file mode 100644 index 0000000000000..76a4f812730df --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +#include "cinn/frontend/paddle/cpp/block_desc.h" +#include "cinn/frontend/paddle/cpp/desc_api.h" +#include "cinn/frontend/paddle/cpp/op_desc.h" +#include "cinn/frontend/paddle/cpp/program_desc.h" +#include "cinn/frontend/paddle/cpp/var_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarTypeToCinn( + const ::paddle::framework::proto::VarType::Type& type); + +::paddle::framework::proto::VarType::Type TransformVarTypeFromCinn( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type& type); + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarDataTypeToCinn( + const ::paddle::framework::proto::VarType::Type& type); + +::paddle::framework::proto::VarType::Type TransformVarDataTypeFromCpp( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type& type); + +// Why use framework::VarDesc* rather than const framework::VarDesc& here? +// framework::VarDesc lack of many API like clear(), etc. 
On the other hand, +// the paddle node return framework::Desc* even if the node is const +void TransformVarDescToCinn(framework::VarDesc* pb_desc, + ::cinn::frontend::paddle::cpp::VarDesc* cpp_desc); + +void TransformVarDescFromCinn( + const ::cinn::frontend::paddle::cpp::VarDesc& cpp_desc, + framework::VarDesc* pb_desc); + +void TransformOpDescToCinn(framework::OpDesc* pb_desc, + ::cinn::frontend::paddle::cpp::OpDesc* cpp_desc); + +void TransformOpDescFromCinn( + const ::cinn::frontend::paddle::cpp::OpDesc& cpp_desc, + framework::OpDesc* pb_desc); + +void TransformBlockDescToCinn( + framework::BlockDesc* pb_desc, + ::cinn::frontend::paddle::cpp::BlockDesc* cpp_desc); + +void TransformBlockDescFromCinn( + const ::cinn::frontend::paddle::cpp::BlockDesc& cpp_desc, + framework::BlockDesc* pb_desc); + +void TransformProgramDescToCinn( + framework::ProgramDesc* pb_desc, + ::cinn::frontend::paddle::cpp::ProgramDesc* cpp_desc); + +void TransformProgramDescFromCinn( + const ::cinn::frontend::paddle::cpp::ProgramDesc& cpp_desc, + framework::ProgramDesc* pb_desc); + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc new file mode 100644 index 0000000000000..ba324295cad72 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
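The tests that follow exercise each transform level. One pattern from transform_desc.cc worth a closer look before reading them: every enum conversion uses a locally #define'd X-macro (SET_TYPE_CASE_ITEM, IMPL_ONE) that expands one case per enumerator and is #undef'd right after the switch, so adjacent functions can reuse the same macro names. A stripped-down sketch of the same technique with placeholder enums:

#include <stdexcept>

enum class EnumA { BOOL, INT32, FP32 };
enum class EnumB { BOOL, INT32, FP32 };

EnumB Convert(EnumA type) {
// One case per enumerator: the two enums only have to agree on
// spelling, not on underlying values.
#define CASE_ITEM(name__) \
  case EnumA::name__:     \
    return EnumB::name__;

  switch (type) {
    CASE_ITEM(BOOL);
    CASE_ITEM(INT32);
    CASE_ITEM(FP32);
    default:
      throw std::invalid_argument("unknown enum value");
  }
#undef CASE_ITEM
}

Scoping the #define/#undef to a single function keeps the macro from leaking into the rest of the translation unit.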
+ +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using PbVarType = framework::proto::VarType; +namespace cpp = ::cinn::frontend::paddle::cpp; + +// check VarDesc +cpp::VarDesc CreateCppVarDesc() { + cpp::VarDesc var("test"); + var.SetType(cpp::VarDescAPI::Type::LOD_TENSOR); + var.SetPersistable(true); + var.SetDataType(cpp::VarDescAPI::Type::FP32); + var.SetShape({100, 200, 300}); + return var; +} + +framework::VarDesc CreatePbVarDesc() { + framework::VarDesc var("test"); + var.SetType(PbVarType::LOD_TENSOR); + var.SetPersistable(true); + var.SetDataType(PbVarType::FP32); + var.SetShape({100, 200, 300}); + return var; +} + +TEST(TransformVarDesc, cpp2pb) { + auto cpp_var = CreateCppVarDesc(); + framework::VarDesc pb_var("init"); + TransformVarDescFromCinn(cpp_var, &pb_var); + + auto correct_var = CreatePbVarDesc(); + ASSERT_EQ(pb_var.Name(), correct_var.Name()); + ASSERT_EQ(pb_var.GetType(), correct_var.GetType()); + ASSERT_EQ(pb_var.Persistable(), correct_var.Persistable()); + ASSERT_EQ(pb_var.GetDataType(), correct_var.GetDataType()); + ASSERT_EQ(pb_var.GetShape(), correct_var.GetShape()); +} + +TEST(TransformVarDesc, pb2cpp) { + auto pb_var = CreatePbVarDesc(); + cpp::VarDesc cpp_var; + TransformVarDescToCinn(&pb_var, &cpp_var); + + auto correct_var = CreateCppVarDesc(); + ASSERT_EQ(cpp_var.Name(), correct_var.Name()); + ASSERT_EQ(cpp_var.GetType(), correct_var.GetType()); + ASSERT_EQ(cpp_var.Persistable(), correct_var.Persistable()); + ASSERT_EQ(cpp_var.GetDataType(), correct_var.GetDataType()); + ASSERT_EQ(cpp_var.GetShape(), correct_var.GetShape()); +} + +// check OpDesc +cpp::OpDesc CreateCppOpDesc() { + cpp::OpDesc op; + op.SetType("test"); + op.SetInput("X", {"x1"}); + op.SetInput("Y", {"y1", "y2"}); + op.SetOutput("Out", {"out1"}); + op.SetAttr("attr_f", 0.1f); + op.SetAttr("attr_str", "test_attr"); + return op; +} + +framework::OpDesc CreatePbOpDesc() { + framework::OpDesc op; + op.SetType("test"); + op.SetInput("X", {"x1"}); + op.SetInput("Y", {"y1", "y2"}); + op.SetOutput("Out", {"out1"}); + op.SetAttr("attr_f", 0.1f); + op.SetAttr("attr_str", std::string("test_attr")); + return op; +} + +TEST(TransformOpDesc, cpp2pb) { + auto cpp_op = CreateCppOpDesc(); + framework::OpDesc pb_op; + TransformOpDescFromCinn(cpp_op, &pb_op); + + auto correct_op = CreatePbOpDesc(); + ASSERT_EQ(pb_op.Type(), correct_op.Type()); + ASSERT_EQ(pb_op.Inputs(), correct_op.Inputs()); + ASSERT_EQ(pb_op.Outputs(), correct_op.Outputs()); + ASSERT_EQ(pb_op.AttrNames(), correct_op.AttrNames()); + + for (const auto &attr_name : pb_op.AttrNames()) { + ASSERT_EQ(pb_op.GetAttrType(attr_name), correct_op.GetAttrType(attr_name)); + } + ASSERT_EQ(pb_op.GetAttrIfExists("attr_f"), + correct_op.GetAttrIfExists("attr_f")); + ASSERT_EQ(pb_op.GetAttrIfExists("attr_str"), + correct_op.GetAttrIfExists("attr_str")); +} + +TEST(TransformOpDesc, pb2cpp) { + auto pb_op = CreatePbOpDesc(); + cpp::OpDesc cpp_op; + TransformOpDescToCinn(&pb_op, &cpp_op); + + auto correct_op = CreateCppOpDesc(); + ASSERT_EQ(cpp_op.Type(), correct_op.Type()); + ASSERT_EQ(cpp_op.inputs(), correct_op.inputs()); + ASSERT_EQ(cpp_op.outputs(), correct_op.outputs()); + ASSERT_EQ(cpp_op.AttrNames(), correct_op.AttrNames()); + ASSERT_EQ(cpp_op.attr_types(), correct_op.attr_types()); + + ASSERT_EQ(cpp_op.GetAttr("attr_f"), + correct_op.GetAttr("attr_f")); + ASSERT_EQ(cpp_op.GetAttr("attr_str"), + correct_op.GetAttr("attr_str")); +} 
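Before the block-level checks: the comment that opens the next section notes that framework::BlockDesc is DISABLE_COPY_AND_ASSIGN, which is why CreateCppBlockDesc/CreatePbBlockDesc fill an out-parameter instead of returning a value. That macro (pulled in via paddle/fluid/platform/macros.h earlier in this diff) is the standard deleted-copy idiom; roughly:

// Approximately what Paddle's macro expands to.
#define DISABLE_COPY_AND_ASSIGN(classname)  \
  classname(const classname&) = delete;     \
  classname& operator=(const classname&) = delete

class NonCopyable {
 public:
  NonCopyable() = default;
  int value = 0;

 private:
  DISABLE_COPY_AND_ASSIGN(NonCopyable);
};

// Returning a named NonCopyable local by value would not compile (the copy
// constructor is deleted and no move constructor is declared), so builders
// take an out-parameter instead, as the block/program helpers below do:
void BuildNonCopyable(NonCopyable* out) { out->value = 42; }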
+ +// check BlockDesc +// framework::BlockDesc is DISABLE_COPY_AND_ASSIGN, so can not return +void CreateCppBlockDesc(cpp::BlockDesc *block) { + block->SetIdx(42); + block->SetParentIdx(4); + block->SetForwardBlockIdx(32); + + auto *op = block->AddOp(); + *op = CreateCppOpDesc(); + + auto *var = block->AddVar(); + *var = CreateCppVarDesc(); +} + +void CreatePbBlockDesc(framework::BlockDesc *block) { + block->Proto()->set_idx(42); + block->Proto()->set_parent_idx(4); + block->Proto()->set_forward_block_idx(32); + + auto *op = block->AppendOp(); + *op = CreatePbOpDesc(); + + auto *var = block->Var("init"); + *var = CreatePbVarDesc(); +} + +TEST(TransformBlockDesc, cpp2pb) { + cpp::BlockDesc cpp_block; + CreateCppBlockDesc(&cpp_block); + + framework::ProgramDesc pb_prog; + auto *pb_block = pb_prog.MutableBlock(0); + TransformBlockDescFromCinn(cpp_block, pb_block); + + framework::ProgramDesc correct_prog; + auto *correct_block = correct_prog.MutableBlock(0); + CreatePbBlockDesc(correct_block); + ASSERT_EQ(pb_block->ID(), correct_block->ID()); + ASSERT_EQ(pb_block->Parent(), correct_block->Parent()); + ASSERT_EQ(pb_block->ForwardBlockID(), correct_block->ForwardBlockID()); + ASSERT_EQ(pb_block->OpSize(), correct_block->OpSize()); + ASSERT_EQ(pb_block->AllVars().size(), correct_block->AllVars().size()); +} + +TEST(TransformBlockDesc, pb2cpp) { + framework::ProgramDesc pb_prog; + auto *pb_block = pb_prog.MutableBlock(0); + CreatePbBlockDesc(pb_block); + + cpp::BlockDesc cpp_block; + TransformBlockDescToCinn(pb_block, &cpp_block); + + cpp::BlockDesc correct_block; + CreateCppBlockDesc(&correct_block); + ASSERT_EQ(cpp_block.Idx(), correct_block.Idx()); + ASSERT_EQ(cpp_block.ParentIdx(), correct_block.ParentIdx()); + ASSERT_EQ(cpp_block.ForwardBlockIdx(), correct_block.ForwardBlockIdx()); + ASSERT_EQ(cpp_block.OpsSize(), correct_block.OpsSize()); + ASSERT_EQ(cpp_block.VarsSize(), correct_block.VarsSize()); +} + +// check ProgramDesc +cpp::ProgramDesc CreateCppProgramDesc() { + cpp::ProgramDesc prog; + prog.SetVersion(22); + + auto *block = prog.AddBlock(); + CreateCppBlockDesc(block); + + return prog; +} + +framework::ProgramDesc CreatePbProgramDesc() { + framework::ProgramDesc prog; + prog.SetVersion(22); + + auto *block = prog.MutableBlock(0); + CreatePbBlockDesc(block); + return prog; +} + +TEST(TransformProgramDesc, cpp2pb) { + auto cpp_prog = CreateCppProgramDesc(); + framework::ProgramDesc pb_prog; + TransformProgramDescFromCinn(cpp_prog, &pb_prog); + + auto correct_prog = CreatePbProgramDesc(); + ASSERT_EQ(pb_prog.Version(), correct_prog.Version()); + ASSERT_EQ(pb_prog.Size(), correct_prog.Size()); +} + +TEST(TransformProgramDesc, pb2cpp) { + auto pb_prog = CreatePbProgramDesc(); + cpp::ProgramDesc cpp_prog; + TransformProgramDescToCinn(&pb_prog, &cpp_prog); + + auto correct_prog = CreateCppProgramDesc(); + ASSERT_EQ(cpp_prog.Version(), correct_prog.Version()); + ASSERT_EQ(cpp_prog.BlocksSize(), correct_prog.BlocksSize()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 0d93cdf57932f..ef1bf0d158787 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -53,15 +53,13 @@ void GLOOParallelContext::InitWithRingID(int ring_id) { platform::errors::OutOfRange("Still not implement InitWithRingID")); } -#define GLOO_CASE(type, T, gw) \ - case type: { \ - VLOG(4) << "Use the gloo all reduce to sync. 
SRC:" << src_tensor; \ - std::vector send_vector##T; \ - framework::TensorToVector(src_tensor, &send_vector##T); \ - auto recv_vector##T = gw->AllReduce(send_vector##T); \ - framework::TensorFromVector(recv_vector##T, dst_tensor); \ - VLOG(4) << "DST:" << *dst_tensor; \ - break; \ +#define GLOO_CASE(type, T, gw) \ + case type: { \ + std::vector send_vector##T; \ + framework::TensorToVector(src_tensor, &send_vector##T); \ + auto recv_vector##T = gw->AllReduce(send_vector##T); \ + framework::TensorFromVector(recv_vector##T, dst_tensor); \ + break; \ } void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, @@ -118,7 +116,7 @@ void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, const auto *src_tensor_ptr = src_tensor.data(); \ gw->AllGatherVector(const_cast(src_tensor_ptr), \ reinterpret_cast(dst_tensor_ptr), \ - value_sendcount); \ + element_nums); \ break; \ } @@ -150,48 +148,31 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, auto *dst_rows_ptr = dst_rows->MutableData(place); const int64_t *src_rows_ptr = src_rows.Data(place); - // VLOG(3) << "Selected Rows of src:" << string::join_strings(dst_rows, ',') - auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); dims[0] = rows_num; auto feature_size = framework::product(dims) / dims[0]; dst_tensor->Resize(dims); - if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, - [&](size_t row) { return row == cpu_rows_num_ptr[0]; })) { - // During sparse communication, the number of each card is same. - // Because gloo wrapper utility class currently don't support - // broadcast, so we only deal the-same case. - VLOG(3) << "Use the gloo all reduce to sync. SRC:" << src_tensor; - // framework::SerializeToStream(VLOG(4), src); - VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); - - gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), - static_cast(dst_rows_ptr), - rows_num_vector[0]); - - switch (dtype) { - GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, - gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, - gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, - gloo_wrapper); - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid datatype for allreduce")); - } + + std::vector element_nums = rows_num_vector; + std::for_each(element_nums.begin(), element_nums.end(), + [feature_size](size_t &x) { x = x * feature_size; }); + + auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); + gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), + static_cast(dst_rows_ptr), + rows_num_vector); + + switch (dtype) { + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, + gloo_wrapper); + default: { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid datatype for allreduce")); } - VLOG(3) << "Selected Row DST:" << *dst_tensor; - VLOG(3) << "Selected Rows of DST:" - << string::join_strings(std::vector(*dst_rows), ','); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The number of each card 
is not the same, gloo only support the-same" - "batch division")); } } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8f45cd0fa6ea1..c31464bf20acc 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/xpu/xpu_op_list.h" #endif DECLARE_bool(check_nan_inf); +DECLARE_bool(benchmark); namespace paddle { namespace imperative { @@ -208,6 +209,19 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index eabca4197a1d3..dda4be8f81c63 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1415,6 +1415,7 @@ USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); USE_TRT_CONVERTER(mish); +USE_TRT_CONVERTER(pool3d) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index ae697529c2554..0eba7d03fea98 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -93,8 +93,9 @@ const std::vector kTRTSubgraphPasses({ "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // - "map_matmul_to_mul_pass", // "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // + "map_matmul_to_mul_pass", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // "add_support_int8_pass", @@ -142,8 +143,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // - "map_matmul_to_mul_pass", // "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // + "map_matmul_to_mul_pass", // "fc_fuse_pass", // "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be @@ -196,15 +198,16 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // "embedding_fc_lstm_fuse_pass", // // TODO(wilber): fix correctness problem. 
// "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "squeeze2_matmul_fuse_pass", // - "reshape2_matmul_fuse_pass", // - "flatten2_matmul_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "squeeze2_matmul_fuse_pass", // + "reshape2_matmul_fuse_pass", // + "flatten2_matmul_fuse_pass", // + "map_matmul_v2_to_mul_pass", // + // "map_matmul_v2_to_matmul_pass", // "map_matmul_to_mul_pass", // - "map_matmul_v2_to_mul_pass", // "fc_fuse_pass", // "repeated_fc_relu_fuse_pass", // "squared_mat_sub_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ef12cb6b36617..b6aa0a230cc2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -19,6 +19,7 @@ nv_library(tensorrt_converter conv3d_op.cc mish_op.cc nearest_interp_v2_op.cc + pool3d_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 05cd7bad5cbac..35c9658108ab5 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -138,8 +138,11 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { - if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && - (padding_algorithm != "SAME")) { + // input_shape.d < 0 means we can't get shape info here. + // we may suffer from issue if shape is not met finally. + if ((padding_algorithm != "SAME") && + ((g_post_pad.w() > 0 && input_shape.d[input_dims - 2] > 0) || + (g_post_pad.h() > 0 && input_shape.d[input_dims - 1] > 0))) { auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, g_pre_pad, g_post_pad); PADDLE_ENFORCE_NOT_NULL( @@ -148,6 +151,7 @@ class Pool2dOpConverter : public OpConverter { "created. The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc new file mode 100644 index 0000000000000..9baed499f14a7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -0,0 +1,228 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +inline void DealCeilMode(const nvinfer1::Dims &input_shape, + std::vector ksize, std::vector strides, + std::vector paddings, nvinfer1::DimsCHW *pre_pad, + nvinfer1::DimsCHW *post_pad, int input_dims) { + int input_depth = input_shape.d[input_dims - 3]; + int input_height = input_shape.d[input_dims - 2]; + int input_width = input_shape.d[input_dims - 1]; + + int floor_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int ceil_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0] + strides[0] - 1) / strides[0] + + 1; + + int floor_h_output_size = + (input_height - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int ceil_h_output_size = + (input_height - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + + int floor_w_output_size = + (input_width - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + int ceil_w_output_size = + (input_width - ksize[2] + 2 * paddings[2] + strides[2] - 1) / strides[2] + + 1; + + if (floor_d_output_size != ceil_d_output_size) { + post_pad->c() = strides[0] - 1; + } + + if (floor_h_output_size != ceil_h_output_size) { + post_pad->h() = strides[1] - 1; + } + + if (floor_w_output_size != ceil_w_output_size) { + post_pad->w() = strides[2] - 1; + } +} + +class Pool3dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool3d op to tensorrt pool3d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_shape = input1->getDimensions(); + int input_dims = input_shape.nbDims; + + bool global_pooling = + BOOST_GET_CONST(bool, op_desc.GetAttr("global_pooling")); + std::string pool_type = + BOOST_GET_CONST(std::string, op_desc.GetAttr("pooling_type")); + std::vector ksize = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("ksize")); + std::vector strides = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); + std::vector paddings = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + bool exclusive = op_desc.HasAttr("exclusive") + ? 
BOOST_GET_CONST(bool, op_desc.GetAttr("exclusive")) + : true; + bool ceil_mode = BOOST_GET_CONST(bool, op_desc.GetAttr("ceil_mode")); + bool adaptive = false; + if (op_desc.HasAttr("adaptive")) + adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive")); + std::string padding_algorithm = "EXPLICIT"; + if (op_desc.HasAttr("padding_algorithm")) + padding_algorithm = + BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "VALID" || padding_algorithm == "SAME") { + std::fill(paddings.begin(), paddings.end(), 0); + } + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + nvinfer1::ReduceOperation reduce_operation = + nvinfer1::ReduceOperation::kMAX; + plugin::Pool3DPlugin::Pool3DType plugin_pool_type = + plugin::Pool3DPlugin::Pool3DType::max; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + reduce_operation = nvinfer1::ReduceOperation::kMAX; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::max; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + reduce_operation = nvinfer1::ReduceOperation::kAVG; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::avg; + } + nvinfer1::DimsCHW nv_ksize(ksize[0], ksize[1], ksize[2]); + nvinfer1::DimsCHW nv_strides(strides[0], strides[1], strides[2]); + nvinfer1::DimsCHW nv_paddings(paddings[0], paddings[1], paddings[2]); + nvinfer1::ILayer *layer = nullptr; + if (op_desc.HasAttr("enable_int8")) { + CHECK(op_desc.HasAttr("X_scale")); + float input_scale = BOOST_GET_CONST(float, op_desc.GetAttr("X_scale")); + engine_->SetTensorDynamicRange(input1, input_scale); + } + + if (engine_->with_dynamic_shape()) { + if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; + } else if (global_pooling) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 28, true); + layer = reduce_layer; + } else { + plugin::Pool3DPluginDynamic *plugin = new plugin::Pool3DPluginDynamic( + ceil_mode, pool_type, adaptive, ksize, strides, paddings, + global_pooling); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); + } + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (global_pooling == true) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 14, true); + layer = reduce_layer; + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (!adaptive) { + if (!ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = 
pool_layer; + } else { + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, + ksize, strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + } else { + // Average pooling needs to exclude the padding pixels from the average + // mean. + // It is not supported well by TRT, we use a plugin here. + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, ksize, + strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "pool3d", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool3d); +REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index edf69dc7aa2b5..0e1b9fe3366ca 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -116,6 +116,17 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, input, ShapeStr(shape))); } return nvinfer1::Dims2(shape[1], shape[2]); + } else if (shape.size() == 2UL) { + if (shape[1] == -1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, ShapeStr(shape))); + } + nvinfer1::Dims dims; + dims.nbDims = 1; + dims.d[0] = shape[1]; + return dims; } return nvinfer1::Dims3(shape[1], 1, 1); } else { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 91515f1fa5811..13504f444109b 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -142,7 +142,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv3d", "conv3d_transpose", "mish", - "nearest_interp_v2"}; + "nearest_interp_v2", + "pool3d"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -339,6 +340,26 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } + + // not support broadcast + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); + auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); + const auto x_shape = x_var_desc->GetShape(); + const auto y_shape = y_var_desc->GetShape(); + if (x_shape.size() != y_shape.size()) { + VLOG(3) + << "matmul op not support broadcast, please check inputs'shape. "; + return false; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + VLOG(3) << "matmul op not support broadcast, please check " + "inputs'shape[i]. 
"; + return false; + } + } + for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -1043,6 +1064,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size(); return false; } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != 4) { + VLOG(3) << "The instance_norm op only support 4-dimensional input in " + "tensorrt."; + return false; + } } if (op_type == "leaky_relu") { @@ -1329,6 +1366,47 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "fc") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + // y'shapes == 2 + auto fc_inputs = desc.Inputs(); + std::string fc_y = ""; + if (fc_inputs.find("Y") != fc_inputs.end()) { + fc_y = "Y"; + } else if (fc_inputs.find("W") != fc_inputs.end()) { + fc_y = "W"; + } else { + VLOG(3) << " input_y(fc_op) must be Y or W "; + return false; + } + + // There is currently no input: Y(weight) more than two dimensions + /* + auto* y_var_desc = block->FindVar(desc.Input(fc_y)[0]); + const auto y_shape = y_var_desc->GetShape(); + if (y_shape.size() != 2) { + VLOG(3) + << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = " + << y_shape.size(); + return false; + } + // y_num_col_dims ==1 + if (desc.HasAttr("y_num_col_dims")) { + int y_num_col_dims = + BOOST_GET_CONST(int, desc.GetAttr("y_num_col_dims")); + if (y_num_col_dims != 1) { + VLOG(3) << " fc_op'y_num_col_dims must be 1, but y_num_col_dims = " + << y_num_col_dims; + return false; + } + } + */ int x_num_col_dims = desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) @@ -1336,8 +1414,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims"))
                   : 1);
      if (x_num_col_dims < 1) {
-        VLOG(3) << "converter expects x_num_col_dims >= 1, "
-                   "but x_num_col_dims = %d.";
+        VLOG(3) << "fc_op expects x_num_col_dims >= 1, "
+                   "but x_num_col_dims = "
+                << x_num_col_dims;
        return false;
      }
    }
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index e6bcb59fd092c..9e93894e623c0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -10,6 +10,7 @@ nv_library(tensorrt_plugin
   roi_align_op_plugin.cu
   gather_nd_op_plugin.cu
   mish_op_plugin.cu
+  pool3d_op_plugin.cu
   DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
 
 nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
index b7c4fb7c99acf..a9a50543e7bb7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
@@ -65,11 +65,6 @@ int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs,
 #endif
                                 cudaStream_t stream) TRT_NOEXCEPT {
   const auto &input_dims = this->getInputDims(0);
-
-  PADDLE_ENFORCE_EQ(input_dims.nbDims, 3,
-                    platform::errors::InvalidArgument(
-                        "Input Dims should be 3 (except the batch), got %d",
-                        input_dims.nbDims));
   int n = batch_size;
   int c = input_dims.d[0];
   int h = input_dims.d[1];
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
new file mode 100644
index 0000000000000..861a9aa9d000b
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
@@ -0,0 +1,375 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h"
+#include "paddle/fluid/operators/math/pooling.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+size_t Pool3DPlugin::getSerializationSize() const TRT_NOEXCEPT {
+  return getBaseSerializationSize() + SerializedSize(ceil_mode_) +
+         SerializedSize(pool3d_type_) + SerializedSize(adaptive_) +
+         SerializedSize(ksize_) + SerializedSize(strides_) +
+         SerializedSize(paddings_) + SerializedSize(input_shape_) +
+         SerializedSize(output_shape_);
+}
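getSerializationSize(), serialize() below, and the deserializing constructor must agree field-for-field and in the same order; if one of them skips or reorders a field, every later field is silently misread. A minimal standalone sketch of that contract, using hypothetical ToyWrite/ToyRead helpers in place of Paddle's SerializeValue/DeserializeValue (trivially copyable fields only):

#include <cassert>
#include <cstring>
#include <vector>

// Write a trivially copyable value at the cursor and advance it.
template <typename T>
void ToyWrite(char **cur, const T &v) {
  std::memcpy(*cur, &v, sizeof(T));
  *cur += sizeof(T);
}

// Read a value back in the same order it was written.
template <typename T>
void ToyRead(const char **cur, T *v) {
  std::memcpy(v, *cur, sizeof(T));
  *cur += sizeof(T);
}

int main() {
  bool ceil_mode = true;
  int pool_type = 1;
  // The buffer size plays the role of getSerializationSize(): it must
  // account for exactly the fields that are written.
  std::vector<char> buf(sizeof(bool) + sizeof(int));

  char *w = buf.data();
  ToyWrite(&w, ceil_mode);
  ToyWrite(&w, pool_type);

  bool ceil_mode2 = false;
  int pool_type2 = 0;
  const char *r = buf.data();
  ToyRead(&r, &ceil_mode2);
  ToyRead(&r, &pool_type2);
  assert(ceil_mode2 == ceil_mode && pool_type2 == pool_type);
  return 0;
}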
+// TRT will call this func when we need to serialize the configuration of
+// tensorrt.
+void Pool3DPlugin::serialize(void *buffer) const TRT_NOEXCEPT {
+  serializeBase(buffer);
+  SerializeValue(&buffer, ceil_mode_);
+  SerializeValue(&buffer, pool3d_type_);
+  SerializeValue(&buffer, adaptive_);
+  SerializeValue(&buffer, ksize_);
+  SerializeValue(&buffer, strides_);
+  SerializeValue(&buffer, paddings_);
+  SerializeValue(&buffer, input_shape_);
+  SerializeValue(&buffer, output_shape_);
+}
+
+Pool3DPlugin *Pool3DPlugin::clone() const TRT_NOEXCEPT {
+  return new Pool3DPlugin(ceil_mode_, pool3d_type_, adaptive_, ksize_,
+                          strides_, paddings_, input_shape_);
+}
+
+const char *Pool3DPlugin::getPluginType() const TRT_NOEXCEPT {
+  return "pool3d_plugin";
+}
+
+int Pool3DPlugin::getNbOutputs() const TRT_NOEXCEPT { return 1; }
+
+int Pool3DPlugin::initialize() TRT_NOEXCEPT { return 0; }
+
+nvinfer1::DataType Pool3DPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType *input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return input_types[0];
+}
+
+void Pool3DPlugin::destroy() TRT_NOEXCEPT { delete this; }
+
+nvinfer1::Dims Pool3DPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nbInputs, 1,
+                    platform::errors::InvalidArgument(
+                        "The Pool3D Plugin only has one input, so the nbInputs "
+                        "value should be 1, but get %d.",
+                        nbInputs));
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "The Pool3D Plugin only has one input, so "
+                                  "the index value should be 0, but get %d.",
+                                  index));
+  PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4,
+                    platform::errors::InvalidArgument(
+                        "The Pool3D Plugin only has four Dimensions, so the "
+                        "nbDims value should be 4, but get %d.",
+                        inputDims[0].nbDims));
+
+  nvinfer1::Dims const &input_dims = inputDims[0];
+
+  nvinfer1::Dims output_dims = input_dims;
+
+  output_dims.d[1] = output_shape_[1];
+  output_dims.d[2] = output_shape_[2];
+  output_dims.d[3] = output_shape_[3];
+  return output_dims;
+}
+
+int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs,
+#if IS_TRT_VERSION_LT(8000)
+                          void **outputs, void *workspace,
+                          cudaStream_t stream) TRT_NOEXCEPT {
+#else
+                          void *const *outputs, void *workspace,
+                          cudaStream_t stream) TRT_NOEXCEPT {
+#endif
+  int input_size = 0;
+  float const *idata = reinterpret_cast<float const *>(inputs[0]);
+  float *const *odatas = reinterpret_cast<float *const *>(outputs);
+
+  std::vector<int> input_shape = input_shape_;
+  std::vector<int> output_shape = output_shape_;
+  input_shape.insert(input_shape.begin(), batchSize);
+  output_shape.insert(output_shape.begin(), batchSize);
+
+  if (pool3d_type_ == Pool3DType::max) {
+    paddle::operators::math::MaxPool<float> pool_process;
+    paddle::operators::math::Pool3dDirectCUDAFunctor<
+        paddle::operators::math::MaxPool<float>, float>
+        pool3d_forward;
+    pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
+                   paddings_, true, adaptive_, odatas[0], stream, pool_process);
+  } else if (pool3d_type_ == Pool3DType::avg) {
+    paddle::operators::math::AvgPool<float> pool_process;
+    paddle::operators::math::Pool3dDirectCUDAFunctor<
+        paddle::operators::math::AvgPool<float>, float>
+        pool3d_forward;
+    pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
+                   paddings_, true, adaptive_, odatas[0], stream, pool_process);
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+// Dynamic Plugin below.
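The dynamic plugin's getOutputDimensions() below builds the standard pooling size formulas out of IExprBuilder nodes: floor mode computes (in - k + 2p) / s + 1, and ceil mode adds s - 1 to the numerator first, which is also why the static path post-pads by at most stride - 1. A standalone numeric check of the two formulas (plain C++, independent of TensorRT, with hypothetical helper names):

#include <cassert>

// Floor-mode and ceil-mode pooling output sizes for one dimension:
//   floor: (in - k + 2p) / s + 1
//   ceil:  (in - k + 2p + s - 1) / s + 1
int FloorOut(int in, int k, int p, int s) { return (in - k + 2 * p) / s + 1; }
int CeilOut(int in, int k, int p, int s) {
  return (in - k + 2 * p + s - 1) / s + 1;
}

int main() {
  // in = 7, k = 2, p = 0, s = 2: floor gives 3 (the last partial window is
  // dropped), ceil gives 4 (the partial window is kept).
  assert(FloorOut(7, 2, 0, 2) == 3);
  assert(CeilOut(7, 2, 0, 2) == 4);
  // When the division is exact the two modes agree and no extra padding
  // is needed:
  assert(FloorOut(8, 2, 0, 2) == 4 && CeilOut(8, 2, 0, 2) == 4);
  return 0;
}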
+ +Pool3DPluginDynamic::Pool3DPluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + const char *pool3d_type; + DeserializeValue(&serialData, &serialLength, &pool3d_type); + pool3d_type_ = std::string(pool3d_type); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &is_global_); +} + +nvinfer1::IPluginV2DynamicExt *Pool3DPluginDynamic::clone() const TRT_NOEXCEPT { + return new Pool3DPluginDynamic(ceil_mode_, pool3d_type_, adaptive_, ksize_, + strides_, paddings_, is_global_); +} + +const char *Pool3DPluginDynamic::getPluginType() const TRT_NOEXCEPT { + return "pool3d_plugin_dynamic"; +} +int Pool3DPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } + +int Pool3DPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } + +void Pool3DPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) TRT_NOEXCEPT {} + +size_t Pool3DPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const TRT_NOEXCEPT { + return 0; +} + +size_t Pool3DPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(ceil_mode_) + SerializedSize(pool3d_type_.c_str()) + + SerializedSize(adaptive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(is_global_); +} + +void Pool3DPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool3d_type_.c_str()); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, is_global_); +} + +nvinfer1::DimsExprs Pool3DPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nb_inputs, 1, + platform::errors::InvalidArgument( + "The Split plugin should be only one input.")); + + PADDLE_ENFORCE_EQ( + inputs[0].d[1]->isConstant(), true, + platform::errors::InvalidArgument("The channel dimension should be " + "static, but we found it's dynamic.")); + nvinfer1::DimsExprs output(inputs[0]); + if (is_global_) { + output.d[2] = expr_builder.constant(1); + output.d[3] = expr_builder.constant(1); + output.d[4] = expr_builder.constant(1); + return output; + } + if (adaptive_) { + output.d[2] = expr_builder.constant(ksize_[0]); + output.d[3] = expr_builder.constant(ksize_[1]); + output.d[4] = expr_builder.constant(ksize_[2]); + return output; + } + + auto stri_0 = expr_builder.constant(strides_[0]); + auto stri_1 = expr_builder.constant(strides_[1]); + auto stri_2 = expr_builder.constant(strides_[2]); + auto one_value = expr_builder.constant(1); + + auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]); + auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]); + auto v2_tmp = expr_builder.constant(-ksize_[2] + 2 * paddings_[2]); + + auto ceil_tmp = + expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1); + auto ceil1_tmp = + expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + 
strides_[1] - 1);
+  auto ceil2_tmp =
+      expr_builder.constant(-ksize_[2] + 2 * paddings_[2] + strides_[2] - 1);
+
+  if (!ceil_mode_) {
+    output.d[2] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[2], *v0_tmp),
+            *stri_0),
+        *one_value);
+    output.d[3] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[3], *v1_tmp),
+            *stri_1),
+        *one_value);
+    output.d[4] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[4], *v2_tmp),
+            *stri_2),
+        *one_value);
+
+  } else {
+    output.d[2] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[2], *ceil_tmp),
+            *stri_0),
+        *one_value);
+    output.d[3] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[3], *ceil1_tmp),
+            *stri_1),
+        *one_value);
+    output.d[4] = expr_builder.operation(
+        nvinfer1::DimensionOperation::kSUM,
+        *expr_builder.operation(
+            nvinfer1::DimensionOperation::kFLOOR_DIV,
+            *expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
+                                    *inputs[0].d[4], *ceil2_tmp),
+            *stri_2),
+        *one_value);
+  }
+
+  return output;
+}
+
+bool Pool3DPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
+    int nb_outputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_NOT_NULL(
+      in_out, platform::errors::InvalidArgument(
+                  "The input of pool3d plugin should not be nullptr."));
+
+  PADDLE_ENFORCE_LT(
+      pos, nb_inputs + nb_outputs,
+      platform::errors::InvalidArgument("The pos(%d) should be less than the "
+                                        "num(%d) of the input and the output.",
+                                        pos, nb_inputs + nb_outputs));
+
+  return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) &&
+          in_out[pos].format == nvinfer1::PluginFormat::kLINEAR);
+}
+
+nvinfer1::DataType Pool3DPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType *input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Pool3D Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
+  PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be float"));
+  return input_types[0];
+}
+
+int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
+                                 const nvinfer1::PluginTensorDesc *output_desc,
+                                 const void *const *inputs,
+                                 void *const *outputs, void *workspace,
+                                 cudaStream_t stream) TRT_NOEXCEPT {
+  auto input_dims = input_desc[0].dims;
+  int n = input_dims.d[0];
+  int c = input_dims.d[1];
+  int d = input_dims.d[2];
+  int h = input_dims.d[3];
+  int w = input_dims.d[4];
+
+  const float *input = static_cast<const float *>(inputs[0]);
+  float *output = static_cast<float *>(outputs[0]);
+
+  std::vector<int> input_shape, output_shape;
+  for (int i = 0; i < input_dims.nbDims; i++)
+
input_shape.push_back(input_dims.d[i]); + output_shape = input_shape; + + std::vector ksize = ksize_; + std::vector paddings = paddings_; + if (is_global_) { + ksize[0] = d; + ksize[1] = h; + ksize[2] = w; + paddings[0] = 0; + paddings[1] = 0; + paddings[2] = 0; + output_shape[2] = 1; + output_shape[3] = 1; + output_shape[4] = 1; + } else { + auto data_dim = CalcOutputSize({d, h, w}, ceil_mode_, adaptive_, ksize_, + strides_, paddings_); + output_shape[2] = data_dim[0]; + output_shape[3] = data_dim[1]; + output_shape[4] = data_dim[2]; + } + + if (pool3d_type_ == "max") { + paddle::operators::math::MaxPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::MaxPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } else if (pool3d_type_ == "avg") { + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h new file mode 100644 index 0000000000000..7c9a8625d70f3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h @@ -0,0 +1,244 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +static std::vector CalcOutputSize(const std::vector& input_shape, + const bool& ceil_mode, + const bool& adaptive, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings) { + std::vector output_shape = input_shape; + if (adaptive) { + output_shape[0] = ksize[0]; + output_shape[1] = ksize[1]; + output_shape[2] = ksize[2]; + } else { + int output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + if (ceil_mode) { + output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) / + strides[0] + + 1; + output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2] + strides[2] - 1) / + strides[2] + + 1; + } + output_shape[0] = output_d; + output_shape[1] = output_h; + output_shape[2] = output_w; + } + return output_shape; +} + +class Pool3DPlugin : public PluginTensorRTV2Ext { + public: + size_t getSerializationSize() const TRT_NOEXCEPT override; + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + void serialize(void* buffer) const TRT_NOEXCEPT override; + + enum class Pool3DType { + max = 0, + avg, + }; + Pool3DPlugin() {} + Pool3DPlugin(bool ceil_mode, Pool3DType pool3d_type, bool adaptive, + std::vector ksize, std::vector strides, + std::vector paddings, std::vector input_shape) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + input_shape_(input_shape) { + output_shape_ = input_shape_; + std::vector output_shape = + CalcOutputSize({input_shape_[1], input_shape_[2], input_shape_[3]}, + ceil_mode_, adaptive_, ksize_, strides_, paddings_); + output_shape_[1] = output_shape[0]; + output_shape_[2] = output_shape[1]; + output_shape_[3] = output_shape[2]; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ Pool3DPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + DeserializeValue(&serialData, &serialLength, &pool3d_type_); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); + } + + Pool3DPlugin* clone() const TRT_NOEXCEPT override; + + const char* getPluginType() const TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; + + int getNbOutputs() const TRT_NOEXCEPT override; + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; + + int initialize() TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override; + +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + private: + bool ceil_mode_; + Pool3DType pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + std::vector input_shape_; + std::vector output_shape_; +}; + +class Pool3DPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginCreator); + +class Pool3DPluginDynamic : public DynamicPluginTensorRT { + public: + Pool3DPluginDynamic() {} + Pool3DPluginDynamic(const bool& ceil_mode, const std::string& pool3d_type, + const bool& adaptive, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, const bool& is_global) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + is_global_(is_global) {} + + Pool3DPluginDynamic(void const* serialData, size_t serialLength); + ~Pool3DPluginDynamic() {} + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + 
int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + bool ceil_mode_; + std::string pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + bool is_global_; +}; + +class Pool3DPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPluginDynamic(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 11187a1c79fca..6fd3944a6c528 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -555,10 +555,6 @@ if(WITH_GPU AND TENSORRT_FOUND) if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() - set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) - inference_download_and_uncompress_without_verify(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") - endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -577,9 +573,6 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_split_converter_test SRCS trt_split_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) - inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc index b8ccb8cee507b..d33b11c389a09 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -36,10 +36,10 @@ TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) { ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); + 
EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 0); EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 171); + EXPECT_EQ(num_ops, 185); } } // namespace seq_pool1_tester diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index a7ff5af1bdc24..b74d1189b804b 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -77,7 +77,7 @@ TEST(tensorrt_tester_LeViT, trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, @@ -103,7 +103,7 @@ TEST(tensorrt_tester_LeViT, serial_diff_batch_trt_fp32) { config.SetModel(FLAGS_modeldir + "/inference.pdmodel", FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); - config.EnableTensorRtEngine(1 << 20, max_batch_size, 6, + config.EnableTensorRtEngine(1 << 20, max_batch_size, 50, paddle_infer::PrecisionType::kFloat32, false, false); paddle_infer::services::PredictorPool pred_pool(config, 1); @@ -145,7 +145,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, @@ -174,6 +174,6 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index cf3398b49ee9b..eb31acbdf7ca1 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -164,6 +164,6 @@ TEST(mkldnn_tester_det_mv3_db, multi_thread2_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc index 6ef894cc3d1d6..3fa41b201c680 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc @@ -132,6 +132,6 @@ TEST(mkldnn_tester_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc 
b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc index 9e83551126552..4e924e3197965 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc @@ -186,7 +186,8 @@ TEST(tensorrt_tester_ernie_xnli, oss_varlen_truth_data_int8) { int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + #if IS_TRT_VERSION_GE(7200) return RUN_ALL_TESTS(); #endif diff --git a/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc index 21991d0da06a1..eaa7bac89efcd 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc @@ -81,6 +81,6 @@ TEST(tensorrt_tester_mobilenetv1, tuned_dynamic_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc index 2d69c933c2f81..ff1647432a12d 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc @@ -151,6 +151,6 @@ TEST(DISABLED_mkldnn_tester_ppyolo_mbv3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc index d74a333232473..9689ec20956a1 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc @@ -150,6 +150,6 @@ TEST(mkldnn_tester_ppyolov2_r50vd, multi_thread2_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc index 6157fdbdb108a..01bec2916e94a 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc @@ -236,6 +236,6 @@ TEST(DISABLED_tensorrt_tester_resnet50, profile_multi_thread_trt_fp32) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc index ed7ab7b5eee7b..380954f9e527d 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc @@ -165,6 +165,6 @@ TEST(DISABLED_tensorrt_tester_resnet50_quant, multi_thread_multi_instance) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + 
::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc index 845bcbc5c5b5f..69a9e8d6a900a 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc @@ -150,6 +150,6 @@ TEST(test_yolov3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index 3691285ba3a51..87331e1978f95 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -197,6 +197,9 @@ void SerializeShapeRangeInfo( void DeserializeShapeRangeInfo( const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) { int fd = open(path.c_str(), O_RDONLY); + if (fd == -1) { + PADDLE_THROW(platform::errors::NotFound("File [%s] is not found.", path)); + } google::protobuf::io::FileInputStream *is = new google::protobuf::io::FileInputStream(fd); google::protobuf::TextFormat::Parse(is, info); diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc index 766afed4e5014..ffd97232652fd 100644 --- a/paddle/fluid/inference/utils/io_utils_tester.cc +++ b/paddle/fluid/inference/utils/io_utils_tester.cc @@ -118,4 +118,8 @@ TEST(shape_info_io, read_and_write) { std::vector names{"test1"}; paddle::inference::UpdateShapeRangeInfo(path, min_shape, max_shape, opt_shape, names); + + ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo( + "no_exists_file", &min_shape, &max_shape, &opt_shape); + , paddle::platform::EnforceNotMet); } diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc index 8060b5cf755c0..71ec26ea5a792 100644 --- a/paddle/fluid/operators/arg_max_op_xpu.cc +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc index f776412c16239..cc81e320080b7 100644 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index e36dd322e0ea1..f2a57b4b9bdfb 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,156 +18,142 @@ limitations under the License. */ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void TranposeNPU(const framework::ExecutionContext& ctx, + const aclrtStream& stream, std::vector* perm, + const Tensor& in, Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(in) + .AddInput(std::move(*perm)) + .AddOutput(*out) + .Run(stream); +} + +static void CastToInt64(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_INT64) + .Run(stream); +} + +template class ArgsortNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); auto* indices = ctx.Output("Indices"); - indices->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + bool descending = ctx.Attr("descending"); - int32_t axis = ctx.Attr("axis"); - auto in_dims = indices->dims(); + auto in_dims = input->dims(); axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - bool descending = ctx.Attr("descending"); - auto stream = - ctx.template device_context() - .stream(); - framework::NPUAttributeMap sort_attr_input = { - {"axis", static_cast(-1)}, {"descending", descending}}; + + auto stream = ctx.template device_context().stream(); + framework::NPUAttributeMap attr = {{"axis", -1}, + {"descending", descending}}; + + Tensor indices_tmp(framework::proto::VarType::INT32); + indices_tmp.Resize(indices->dims()); if (axis == -1 || axis + 1 == in_dims.size()) { - const auto& sort_runner = - NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input); - sort_runner.Run(stream); + output->mutable_data(ctx.GetPlace()); + indices_tmp.mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); + runner.Run(stream); } else { - // transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap trans_attr_input = {{"perm", trans}}; - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_input_runner = - NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input); - trans_input_runner.Run(stream); - Tensor trans_indices; - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_indice_runner = NpuOpRunner( - "TransposeD", {*indices}, {trans_indices}, trans_attr_input); - trans_indice_runner.Run(stream); - Tensor trans_output; + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_input(input->type()); + trans_input.Resize(trans_dims); + TranposeNPU(ctx, stream, &perm, *input, &trans_input); + + Tensor trans_output(input->type()); + Tensor trans_indices(framework::proto::VarType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_output_runner = NpuOpRunner( - "TransposeD", {*output}, {trans_output}, trans_attr_input); - trans_output_runner.Run(stream); - const auto& sort_runner = - NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices}, - sort_attr_input); - sort_runner.Run(stream); - // transpose back - const auto& trans_indices_back_runner = NpuOpRunner( - "TransposeD", {trans_indices}, {*indices}, trans_attr_input); - trans_indices_back_runner.Run(stream); - const auto& trans_output_back_runner = NpuOpRunner( - "TransposeD", {trans_output}, {*output}, trans_attr_input); - trans_output_back_runner.Run(stream); + trans_indices.mutable_data(trans_dims, ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Sort", {trans_input}, + {trans_output, trans_indices}, attr); + runner.Run(stream); + + TranposeNPU(ctx, stream, &perm, trans_output, output); + TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); } + CastToInt64(ctx, stream, indices_tmp, indices); } }; -template -static void ReshapeNPU(const framework::Tensor* input, - const std::vector& input_shapes, - framework::Tensor* output) { - output->ShareDataWith(*input); - 
output->Resize(framework::make_ddim(std::move(input_shapes))); -} - template static void FullAssignNPU(const framework::ExecutionContext& ctx, - Type ind_lastdim, Type outer_dim, - const framework::DDim& trans_dims, - const framework::Tensor* input, - const framework::Tensor* indices, - framework::Tensor* t_out) { - // reshape input - Type input_shape = ind_lastdim * outer_dim; - std::vector input_shapes = {input_shape}; - Tensor input_reshape_tensor(input->type()); - ReshapeNPU(input, input_shapes, &input_reshape_tensor); - // reshape index - std::vector index_shapes = {outer_dim, ind_lastdim}; - framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim}); - Tensor ind_2d_tensor(indices->type()); - ReshapeNPU(indices, index_shapes, &ind_2d_tensor); - // range_flatten_index - std::vector range_flatten_index; - for (Type i = 0; i < input_shape; i += ind_lastdim) { - range_flatten_index.push_back(static_cast(i)); + const aclrtStream& stream, + const framework::DDim in_dims, const Tensor& input, + const Tensor& indices, Tensor* t_out) { + const int64_t input_height = + framework::product(framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + Tensor input_tmp; + input_tmp.ShareDataWith(input); + input_tmp.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + Tensor indices_tmp; + indices_tmp.ShareDataWith(indices); + indices_tmp.Resize( + framework::make_ddim(std::vector{input_height, input_width})); + + std::vector indexs_value; + for (Type i = 0; i < input_height; i++) { + indexs_value.push_back(i * input_width); } - Tensor range_flatten_index_tensor(framework::proto::VarType::INT32); - range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim})); - range_flatten_index_tensor.mutable_data( - {static_cast(range_flatten_index.size())}, ctx.GetPlace()); - TensorFromVector(range_flatten_index, ctx.device_context(), - &range_flatten_index_tensor); - Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type()); - std::vector flatten_shape = {outer_dim, 1}; - ReshapeNPU(&range_flatten_index_tensor, flatten_shape, - &range_flatten_index_expand_tensor); - auto stream = - ctx.template device_context() - .stream(); - Tensor ind_2d_add_tensor; - ind_2d_add_tensor.mutable_data(ind_2d, ctx.GetPlace()); - const auto& runner_ind_2d_tensor = NpuOpRunner( - std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor}, - {ind_2d_add_tensor}, {}); - runner_ind_2d_tensor.Run(stream); - Tensor ind_reshape_tensor(ind_2d_add_tensor.type()); - ReshapeNPU(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor); - Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type()); - std::vector ind_shape = {input_shape, 1}; - ReshapeNPU(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor); - // expand_index - Tensor input_scatter_tensor; - input_scatter_tensor.Resize({input_shape}); - input_scatter_tensor.mutable_data(ctx.GetPlace()); - Tensor input_scatter_tensor_ori; - input_scatter_tensor_ori.Resize({input_shape}); - input_scatter_tensor_ori.mutable_data(ctx.GetPlace()); - std::vector trans_shapes; - - for (int i = 0; i < trans_dims.size(); i++) { - trans_shapes.push_back(trans_dims[i]); - } - NpuOpRunner runner_scatter; - runner_scatter.SetType("TensorScatterUpdate") - .AddInput(input_scatter_tensor_ori) - .AddInput(ind_reshape_expand_tensor) - .AddInput(input_reshape_tensor) - .AddOutput(input_scatter_tensor); - runner_scatter.Run(stream); - 
framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(), - ctx.template device_context(), - t_out); - t_out->Resize(framework::make_ddim(trans_shapes)); + Tensor indexs_tmp(indices.type()); + framework::TensorFromVector(indexs_value, ctx.device_context(), + &indexs_tmp); + indexs_tmp.Resize( + framework::make_ddim(std::vector{input_height, 1})); + + Tensor indices_index(indices.type()); + indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); + const auto& runner_add = + NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); + runner_add.Run(stream); + + indices_index.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + t_out->mutable_data(ctx.GetPlace()); + Tensor out_tmp(t_out->type()); + out_tmp.ShareDataWith(*t_out); + + const auto& runner = + NpuOpRunner("TensorScatterUpdate", {input_tmp, indices_index, input_tmp}, + {out_tmp}, {}); + runner.Run(stream); } -template +template class ArgsortGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -172,75 +161,42 @@ class ArgsortGradNPUKernel : public framework::OpKernel { auto* dX = ctx.Output(framework::GradVarName("X")); auto* dO = ctx.Input(framework::GradVarName("Out")); int axis = ctx.Attr("axis"); + auto in_dims = indices->dims(); axis = (axis < 0) ? (in_dims.size() + axis) : axis; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - dX->mutable_data(ctx.GetPlace()); - Tensor dxt; - dxt.mutable_data(dX->dims(), place); - const auto& runner_flatten = - NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {}); - runner_flatten.Run(stream); - FillNpuTensorWithConstant(&dxt, static_cast(0)); if (dO->numel() == 0) return; - // Do full assig n - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t outer_dim = framework::product( - framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t ind_lastdim = in_dims[in_dims.size() - 1]; - FullAssignNPU(ctx, ind_lastdim, outer_dim, in_dims, dO, - indices, dX); + auto stream = ctx.template device_context().stream(); + + if (axis == -1 || axis + 1 == in_dims.size()) { + FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - std::vector axis; - for (size_t i = 0; i < trans.size(); i++) { - axis.push_back(in_dims[trans[i]]); + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap attr_input = {{"perm", trans}}; - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - // Do transpose - const auto& runner_transpose_dx = NpuOpRunner( - std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input}); - runner_transpose_dx.Run(stream); - const auto& runner_transpose_ind = NpuOpRunner( - std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input}); - runner_transpose_ind.Run(stream); - - const int64_t outer_dim = 
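The rewritten FullAssignNPU above reduces the scatter to flat index arithmetic: viewing the data as [input_height, input_width], the sort indices are local to each row, so adding i * input_width to row i turns them into global offsets into the flattened tensor, and a single TensorScatterUpdate then writes every gradient element back to its pre-sort position. A CPU sketch of that arithmetic (hypothetical names, std::vector standing in for tensors):

    #include <cstdint>
    #include <vector>

    // Sketch: scatter rows of `vals` back through per-row sort indices.
    // indices[i * width + j] is the within-row source position of element
    // (i, j); adding i * width converts it to a global flat offset.
    std::vector<float> FullAssignRef(const std::vector<float>& vals,
                                     const std::vector<int64_t>& indices,
                                     int64_t height, int64_t width) {
      std::vector<float> out(vals.size());
      for (int64_t i = 0; i < height; ++i) {
        for (int64_t j = 0; j < width; ++j) {
          int64_t global = indices[i * width + j] + i * width;
          out[global] = vals[i * width + j];
        }
      }
      return out;
    }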
framework::product( - framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssignNPU(ctx, ind_lastdim, outer_dim, trans_dims, - &trans_dO, &trans_ind, &tmp_out); - - // transpose back - const auto& runner_transpose_out = NpuOpRunner( - std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input}); - runner_transpose_out.Run(stream); + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_dout(dO->type()); + Tensor trans_ids(indices->type()); + trans_dout.Resize(trans_dims); + trans_ids.Resize(trans_dims); + + TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); + TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); + + Tensor trans_dx(dO->type()); + trans_dx.Resize(trans_dims); + FullAssignNPU(ctx, stream, trans_dims, trans_dout, trans_ids, + &trans_dx); + + TranposeNPU(ctx, stream, &perm, trans_dx, dX); } } }; @@ -251,11 +207,8 @@ class ArgsortGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - argsort, ops::ArgsortNPUKernel, - ops::ArgsortNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort, ops::ArgsortNPUKernel, + ops::ArgsortNPUKernel); -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort_grad, ops::ArgsortGradNPUKernel, + ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc new file mode 100644 index 0000000000000..8b2fa60f8722e --- /dev/null +++ b/paddle/fluid/operators/bincount_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bincount_op.h" + +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class BincountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of BincountOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of BincountOp should not be null.")); + + auto input_dim = ctx->GetInputDim("X"); + auto minlength = ctx->Attrs().Get("minlength"); + + PADDLE_ENFORCE_GE(minlength, 0, + platform::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ(input_dim.size(), 1, + platform::errors::InvalidArgument( + "The 'shape' of Input(X) must be 1-D tensor." 
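Both argsort kernels registered above lean on the same trick: the NPU Sort op only sorts along the innermost axis, so for any other axis the kernel builds an identity permutation, swaps the target axis with the last one, sorts, and applies the same permutation again to undo the transpose (a single swap is its own inverse, so one perm serves both directions). The permutation setup, sketched in isolation:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: identity permutation with `axis` and the last dim swapped.
    // Applying it twice restores the original layout, which is why the
    // kernel reuses the same perm for the transpose back.
    std::vector<int64_t> MakeSortPerm(int64_t rank, int64_t axis) {
      std::vector<int64_t> perm(rank);
      for (int64_t i = 0; i < rank; ++i) perm[i] = i;
      std::swap(perm[axis], perm[rank - 1]);
      return perm;
    }

    // Permuted shape: shape[i] = dims[perm[i]].
    std::vector<int64_t> PermuteShape(const std::vector<int64_t>& dims,
                                      const std::vector<int64_t>& perm) {
      std::vector<int64_t> shape(perm.size());
      for (size_t i = 0; i < perm.size(); ++i) shape[i] = dims[perm[i]];
      return shape;
    }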
+ "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (ctx->HasInput("Weights")) { + auto weights_dim = ctx->GetInputDim("Weights"); + PADDLE_ENFORCE_EQ(weights_dim.size(), 1, + platform::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], input_dim[0], + platform::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." + "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, input_dim)); + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto data_type = + ctx.HasInput("Weights") + ? OperatorWithKernel::IndicateVarDataType(ctx, "Weights") + : OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class BincountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input tensor of Bincount op,"); + AddInput("Weights", "(Tensor) The weights tensor of Bincount op,") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Bincount op,"); + AddAttr("minlength", "(int) The minimal numbers of bins") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment(R"DOC( + Bincount Operator. + Computes frequency of each value in the input tensor. + Elements of input tensor should be non-negative ints. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + bincount, ops::BincountOp, ops::BincountOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + bincount, ops::BincountKernel, + ops::BincountKernel, + ops::BincountKernel, + ops::BincountKernel); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu new file mode 100644 index 0000000000000..757f728629106 --- /dev/null +++ b/paddle/fluid/operators/bincount_op.cu @@ -0,0 +1,160 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/bincount_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelBincount(const InputT* input, const int total_elements, + const bool has_weights, const T* weights, + OutT* output) { + if (!has_weights) { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + } + } else { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], + static_cast(weights[i])); + } + } +} + +template +void BincountCUDAInner(const framework::ExecutionContext& context) { + const Tensor* input = context.Input("X"); + const Tensor* weights = context.Input("Weights"); + Tensor* output = context.Output("Out"); + auto& minlength = context.Attr("minlength"); + + const InputT* input_data = input->data(); + + const int input_numel = input->numel(); + + if (input_data == nullptr) { + framework::DDim out_dim{0}; + output->Resize(out_dim); + output->mutable_data(context.GetPlace()); + return; + } + auto input_x = framework::EigenVector::Flatten(*input); + + framework::Tensor input_min_t, input_max_t; + auto* input_max_data = + input_max_t.mutable_data({1}, context.GetPlace()); + auto* input_min_data = + input_min_t.mutable_data({1}, context.GetPlace()); + + auto input_max_scala = framework::EigenScalar::From(input_max_t); + auto input_min_scala = framework::EigenScalar::From(input_min_t); + + auto* place = context.template device_context().eigen_device(); + input_max_scala.device(*place) = input_x.maximum(); + input_min_scala.device(*place) = input_x.minimum(); + + Tensor input_min_cpu, input_max_cpu; + TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); + TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); + + InputT input_min = input_min_cpu.data()[0]; + + PADDLE_ENFORCE_GE( + input_min, static_cast(0), + platform::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = + static_cast(input_max_cpu.data()[0]) + 1L; + + output_size = std::max(output_size, static_cast(minlength)); + framework::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = (weights != nullptr); + + const T* weights_data = has_weights ? 
weights->data() : nullptr; + + auto stream = + context.template device_context().stream(); + + if (!has_weights) { + int64_t* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, 0L); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + const auto& weights_type = weights->type(); + + if (weights_type == framework::proto::VarType::FP32) { + float* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + double* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } + } +} + +template +class BincountCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("X"); + const auto& input_type = input->type(); + + if (input_type == framework::proto::VarType::INT32) { + BincountCUDAInner(context); + } else if (input_type == framework::proto::VarType::INT64) { + BincountCUDAInner(context); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bincount, ops::BincountCUDAKernel, + ops::BincountCUDAKernel, + ops::BincountCUDAKernel, + ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h new file mode 100644 index 0000000000000..a142332bce266 --- /dev/null +++ b/paddle/fluid/operators/bincount_op.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
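Two things are worth calling out in the CUDA path above: the output length is data-dependent, so the kernel computes the max and min on device with Eigen reductions and copies the scalars back (TensorCopySync) before resizing Out; and the histogram itself is built with CudaAtomicAdd so threads landing on the same bin do not race. The same correctness argument, sketched in portable C++ with std::atomic (hypothetical harness, not the device kernel):

    #include <atomic>
    #include <cstdint>
    #include <vector>

    // Sketch: concurrent increments of a shared bin are safe only if each
    // increment is indivisible; fetch_add provides that, just as
    // CudaAtomicAdd does for device memory in KernelBincount.
    void CountInto(const std::vector<int>& values,
                   std::vector<std::atomic<int64_t>>& bins) {
      for (int v : values) {
        bins[v].fetch_add(1, std::memory_order_relaxed);
      }
    }
    // Several threads may call CountInto on disjoint slices of the input
    // and still produce an exact histogram.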
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void BincountInner(const framework::ExecutionContext& context) { + const Tensor* input = context.Input("X"); + const Tensor* weights = context.Input("Weights"); + Tensor* output = context.Output("Out"); + auto& minlength = context.Attr("minlength"); + + const InputT* input_data = input->data(); + + auto input_numel = input->numel(); + + if (input_data == nullptr) { + framework::DDim out_dim{0}; + output->Resize(out_dim); + output->mutable_data(context.GetPlace()); + return; + } + + PADDLE_ENFORCE_GE( + *std::min_element(input_data, input_data + input_numel), + static_cast(0), + platform::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = static_cast(*std::max_element( + input_data, input_data + input_numel)) + + 1L; + output_size = std::max(output_size, static_cast(minlength)); + + framework::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = (weights != nullptr); + + if (has_weights) { + const T* weights_data = weights->data(); + const auto& weights_type = weights->type(); + if (weights_type == framework::proto::VarType::FP32) { + float* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } else { + double* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } + + } else { + int64_t* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, 0L); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += 1L; + } + } +} + +template +class BincountKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("X"); + const auto& input_type = input->type(); + + if (input_type == framework::proto::VarType::INT32) { + BincountInner(context); + } else if (input_type == framework::proto::VarType::INT64) { + BincountInner(context); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index c7c0f81f2131f..c1a296f2b2788 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,6 +23,9 @@ limitations under the License. 
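Summing up the header above: bincount takes a 1-D tensor of non-negative integers, sizes its output as max(largest element + 1, minlength) (which is why InferShape in bincount_op.cc can only report {-1} at compile time), and either counts occurrences or accumulates the optional per-element weights. Reference semantics in a few lines of C++ (sketch, float weights only; the real op also dispatches on double and integer types):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch of bincount: out[v] is the count of v in x, or the sum of
    // weights[i] over all i with x[i] == v when weights are supplied.
    std::vector<float> BincountRef(const std::vector<int64_t>& x,
                                   const std::vector<float>& weights,  // may be empty
                                   int64_t minlength) {
      int64_t size = minlength;
      for (int64_t v : x) size = std::max(size, v + 1);
      std::vector<float> out(static_cast<size_t>(size), 0.0f);
      for (size_t i = 0; i < x.size(); ++i) {
        out[x[i]] += weights.empty() ? 1.0f : weights[i];
      }
      return out;
    }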
*/ namespace paddle { namespace operators { +using var_type = framework::proto::VarType; +namespace plat = paddle::platform; + template class CastXPUKernel : public framework::OpKernel { using XPUInTDType = typename XPUTypeTrait::Type; @@ -31,53 +34,49 @@ class CastXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); - auto in_type = static_cast( - context.Attr("in_dtype")); - auto out_type = static_cast( - context.Attr("out_dtype")); + auto in_type = static_cast(context.Attr("in_dtype")); + auto out_type = static_cast(context.Attr("out_dtype")); auto* in_data = in->data(); auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; - if (out_type == framework::proto::VarType::FP32) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if (out_type == framework::proto::VarType::INT32) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if (out_type == framework::proto::VarType::INT64) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if ((out_type == framework::proto::VarType::BOOL) && - (in_type == framework::proto::VarType::FP32)) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), (const float*)in_data, - reinterpret_cast(out_data), numel); - } else if (out_type == framework::proto::VarType::FP16) { - auto* out_data = - out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - reinterpret_cast(out_data), numel); - - } else { - PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", - in_type, out_type)); + switch (out_type) { + case var_type::FP32: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::FP16: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast( + out->mutable_data(context.GetPlace())), + numel); + break; + case var_type::INT64: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::INT32: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::BOOL: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + default: + PADDLE_THROW(platform::errors::Unavailable( + "Not supported cast %d -> %d", in_type, out_type)); } PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + platform::errors::External("XPU CAST API return wrong value[%d %s].", r, + XPUAPIErrorMsg[r])); } }; @@ -90,5 +89,6 @@ REGISTER_OP_XPU_KERNEL( ops::CastXPUKernel, ops::CastXPUKernel, - ops::CastXPUKernel); + ops::CastXPUKernel, + ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc new file mode 100644 index 0000000000000..7d4b02af418be --- /dev/null +++ 
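The cast_op_xpu.cc rewrite above is behavior-preserving apart from the extra INT64 registration: the if/else chain over the output dtype becomes a switch over the VarType enum, with each arm instantiating xpu::cast_v2 for the matching output type, and the error message now carries XPUAPIErrorMsg. The dispatch shape, sketched without the XPU runtime (CastTo is a hypothetical host-side stand-in for cast_v2):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    enum class VarType { FP32, INT32, INT64, BOOL };

    // Hypothetical stand-in for the typed conversion primitive.
    template <typename InT, typename OutT>
    std::vector<OutT> CastTo(const std::vector<InT>& in) {
      return std::vector<OutT>(in.begin(), in.end());
    }

    // Sketch: one switch over the requested dtype, one typed instantiation
    // per arm, mirroring the structure the XPU kernel now uses. Returns the
    // element count just to give each arm a result to hand back.
    template <typename InT>
    size_t DispatchCast(const std::vector<InT>& in, VarType out_type) {
      switch (out_type) {
        case VarType::FP32:  return CastTo<InT, float>(in).size();
        case VarType::INT32: return CastTo<InT, int32_t>(in).size();
        case VarType::INT64: return CastTo<InT, int64_t>(in).size();
        case VarType::BOOL:  return CastTo<InT, bool>(in).size();
      }
      throw std::runtime_error("unsupported cast target");
    }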
b/paddle/fluid/operators/clip_op_xpu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ClipXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto max = static_cast(ctx.Attr("max")); + if (ctx.HasInput("Max")) { + Tensor max_cpu; + auto* max_t = ctx.Input("Max"); + auto* max_data = max_t->data(); + if (platform::is_xpu_place(max_t->place())) { + TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + max_data = max_cpu.data(); + } + max = max_data[0]; + } + + auto min = ctx.Attr("min"); + if (ctx.HasInput("Min")) { + Tensor min_cpu; + auto* min_t = ctx.Input("Min"); + auto* min_data = min_t->data(); + if (platform::is_xpu_place(min_t->place())) { + TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + min_data = min_cpu.data(); + } + min = min_data[0]; + } + + using XPUDataType = typename XPUTypeTrait::Type; + auto& dev_ctx = ctx.template device_context(); + auto x_data = reinterpret_cast(x->data()); + auto out_data = reinterpret_cast(out->data()); + int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(), min, + max); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(clip_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(clip, ops::ClipXPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba3..d2ad93bbae921 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -22,3 +22,9 @@ endif() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") + +if(WITH_XPU) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(equal, XPU);\nUSE_OP_DEVICE_KERNEL(not_equal, XPU);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(less_than, XPU);\nUSE_OP_DEVICE_KERNEL(less_equal, XPU);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(greater_than, XPU);\nUSE_OP_DEVICE_KERNEL(greater_equal, XPU);\n") +endif() diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc new file mode 100644 index 0000000000000..59e457caa1862 --- 
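One detail in clip_op_xpu.cc below: min and max can arrive either as attributes or as optional one-element tensors, and when such a tensor lives on the XPU its scalar has to be synced to host memory (TensorCopySync to CPUPlace) before it can be dereferenced; only then is the elementwise clamp launched. The clamp semantics themselves are simply:

    #include <algorithm>
    #include <vector>

    // Sketch of clip: out[i] = min(max(x[i], lo), hi), with lo/hi taken
    // from the "min"/"max" attributes unless the optional "Min"/"Max"
    // tensor inputs override them.
    std::vector<float> ClipRef(const std::vector<float>& x, float lo, float hi) {
      std::vector<float> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        out[i] = std::min(std::max(x[i], lo), hi);
      }
      return out;
    }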
/dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +template +void XPUCompare( + const framework::ExecutionContext& ctx, + std::function&, const std::vector&)> + func) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + + auto x_shape = framework::vectorize(x->dims()); + auto y_shape = framework::vectorize(y->dims()); + + auto x_data = reinterpret_cast(x->data()); + auto y_data = reinterpret_cast(y->data()); + auto z_data = z->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = + ctx.template device_context(); + + int ret = func(dev_ctx.x_context(), x_data, y_data, z_data, x_shape, y_shape); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel compare op occur error[%d %s] in XPUCompare.", ret, + XPUAPIErrorMsg[ret])); +} + +template +class EqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_equal); + } +}; + +template +class NotEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_not_equal); + } +}; + +template +class LessThanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_less_than); + } +}; + +template +class LessEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_less_equal); + } +}; + +template +class GreaterThanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_greater_than); + } +}; + +template +class GreaterEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_greater_equal); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(equal, + ops::EqualXPUKernel, + ops::EqualXPUKernel, + ops::EqualXPUKernel); + +REGISTER_OP_XPU_KERNEL(not_equal, + ops::NotEqualXPUKernel, + ops::NotEqualXPUKernel, + ops::NotEqualXPUKernel); + +REGISTER_OP_XPU_KERNEL(less_than, + 
ops::LessThanXPUKernel, + ops::LessThanXPUKernel, + ops::LessThanXPUKernel); + +REGISTER_OP_XPU_KERNEL( + less_equal, ops::LessEqualXPUKernel, + ops::LessEqualXPUKernel, + ops::LessEqualXPUKernel); + +REGISTER_OP_XPU_KERNEL( + greater_than, ops::GreaterThanXPUKernel, + ops::GreaterThanXPUKernel, + ops::GreaterThanXPUKernel); + +REGISTER_OP_XPU_KERNEL( + greater_equal, ops::GreaterEqualXPUKernel, + ops::GreaterEqualXPUKernel, + ops::GreaterEqualXPUKernel); + +#endif diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index e8cf1a46db3cc..0c0eb1577e802 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/cum_op.h" @@ -21,6 +21,38 @@ namespace operators { using Tensor = framework::Tensor; +static void CumsumImp(const Tensor& input, Tensor* output, + const framework::NPUAttributeMap& attr_input, + const framework::ExecutionContext& ctx) { + auto stream = + ctx.template device_context() + .stream(); + if (input.type() == framework::proto::VarType::INT64) { + Tensor tmp_input; + tmp_input.mutable_data(input.dims(), ctx.GetPlace()); + auto dst_acl_dtype = ConvertToNpuDtype(tmp_input.type()); + const auto& cast_runner_1 = + NpuOpRunner("Cast", {input}, {tmp_input}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_1.Run(stream); + + Tensor tmp_output; + tmp_output.mutable_data(output->dims(), ctx.GetPlace()); + const auto& runner = + NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); + runner.Run(stream); + + dst_acl_dtype = ConvertToNpuDtype(output->type()); + const auto& cast_runner_2 = + NpuOpRunner("Cast", {tmp_output}, {*output}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_2.Run(stream); + } else { + const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); + runner.Run(stream); + } +} + template class CumSumNPUKernel : public framework::OpKernel { public: @@ -36,10 +68,6 @@ class CumSumNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = { {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - auto stream = - ctx.template device_context() - .stream(); - bool flatten = ctx.Attr("flatten"); if (flatten) { PADDLE_ENFORCE_EQ( @@ -53,11 +81,9 @@ class CumSumNPUKernel : public framework::OpKernel { new_x.Resize(framework::make_ddim({x->numel()})); - const auto& runner = NpuOpRunner("CumsumD", {new_x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(new_x, out, attr_input, ctx); } else { - const auto& runner = NpuOpRunner("CumsumD", {*x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(*x, out, attr_input, ctx); } } }; @@ -69,5 +95,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( cumsum, ops::CumSumNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::CumSumNPUKernel, +#endif ops::CumSumNPUKernel, ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index b5c8bfff0dc39..50d247d9c0590 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ 
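The six comparison kernels registered above differ only in which xpu::broadcast_* primitive they invoke, so compare_op_xpu.cc routes them all through one XPUCompare helper that takes the primitive as a callable. The pattern, sketched over equal-shaped host vectors (the real helper also passes the two shape vectors so the primitive can broadcast):

    #include <functional>
    #include <vector>

    // Sketch: one driver, many predicates; the same factoring XPUCompare
    // applies to broadcast_equal, broadcast_less_than, and friends.
    using CompareFn = std::function<bool(float, float)>;

    std::vector<bool> CompareRef(const std::vector<float>& x,
                                 const std::vector<float>& y, CompareFn fn) {
      std::vector<bool> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = fn(x[i], y[i]);
      return out;
    }

    // Usage: CompareRef(x, y, [](float a, float b) { return a < b; });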
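And on cumsum_op_npu.cc just above: CumsumD apparently cannot consume int64 directly on Ascend, so CumsumImp casts int64 inputs to a supported dtype, runs CumsumD on the temporary, and casts the result back, leaving the other dtypes on the direct path. The cast-compute-cast shape in C++ (sketch; int32 assumed as the intermediate, so values must stay within int32 range):

    #include <cstdint>
    #include <vector>

    // Sketch of the workaround: cast down, compute, cast back.
    std::vector<int64_t> CumsumInt64Ref(const std::vector<int64_t>& x) {
      std::vector<int32_t> tmp(x.begin(), x.end());  // Cast: int64 -> int32
      for (size_t i = 1; i < tmp.size(); ++i) tmp[i] += tmp[i - 1];  // CumsumD
      return std::vector<int64_t>(tmp.begin(), tmp.end());  // Cast back
    }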
b/paddle/fluid/operators/dropout_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include #include diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 48b98dafc7bb5..4cc4228b16429 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -167,10 +167,16 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseSubNPUKernel, +#endif ops::ElementwiseSubNPUKernel, ops::ElementwiseSubNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, ops::ElementwiseSubGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseSubGradNPUKernel, +#endif ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index 85fe86a9e606f..4b0e0770573a6 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 845e5659a8836..eec925b2c057b 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -18,6 +18,7 @@ register_operators(EXCLUDES fused_bn_add_activation_op fused_attention_op fused_transformer_op + fused_feedforward_op resnet_unit_op) # fusion_gru_op does not have CUDA kernel @@ -79,6 +80,9 @@ if (WITH_GPU OR WITH_ROCM) nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + + op_library(fused_feedforward_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_feedforward);\n") # fused_attention_op op_library(fused_attention_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_attention);\n") diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index a286c39f7f8db..6c4ac318264e8 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -328,9 +328,206 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { } }; +class FusedAttentionGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get("attn_dropout_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when attn_dropout_is_test is false")); + + OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance", + "FusedAttentionGrad"); + if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Scale"), + ctx->GetInputDim("Ln2Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Ln2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Bias"), + ctx->GetInputDim("Ln2Bias")); + } + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", + "FusedAttentionGrad"); + if (ctx->Attrs().Get("pre_layer_norm") == true) { + OP_INOUT_CHECK(ctx->HasInput("LnOut"), "Input", "LnOut", + "FusedAttentionGrad"); + } + OP_INOUT_CHECK(ctx->HasInput("QKVW"), "Input", "QKVW", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("QKVBias"), "Input", "QKVBias", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("SrcMask"), "Input", "SrcMask", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearW"), "Input", "OutLinearW", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "FusedAttentionGrad"); + + if (ctx->HasOutput(framework::GradVarName("LnScale"))) { + ctx->SetOutputDim(framework::GradVarName("LnScale"), + 
ctx->GetInputDim("LnScale")); + } + if (ctx->HasOutput(framework::GradVarName("LnBias"))) { + ctx->SetOutputDim(framework::GradVarName("LnBias"), + ctx->GetInputDim("LnBias")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), + ctx->GetInputDim("OutLinearBias")); + ctx->SetOutputDim(framework::GradVarName("OutLinearW"), + ctx->GetInputDim("OutLinearW")); + ctx->SetOutputDim(framework::GradVarName("QKVW"), ctx->GetInputDim("QKVW")); + ctx->SetOutputDim(framework::GradVarName("QKVBias"), + ctx->GetInputDim("QKVBias")); + + ctx->SetOutputDim(framework::GradVarName("LnOut"), + ctx->GetInputDim("LnOut")); + ctx->SetOutputDim(framework::GradVarName("FMHAOut"), + ctx->GetInputDim("FMHAOut")); + ctx->SetOutputDim(framework::GradVarName("QKTVOut"), + ctx->GetInputDim("QKTVOut")); + ctx->SetOutputDim(framework::GradVarName("TransposeOut2"), + ctx->GetInputDim("TransposeOut2")); + ctx->SetOutputDim(framework::GradVarName("QKOut"), + ctx->GetInputDim("QKOut")); + ctx->SetOutputDim(framework::GradVarName("SoftmaxOut"), + ctx->GetInputDim("SoftmaxOut")); + ctx->SetOutputDim(framework::GradVarName("AttnDropoutOut"), + ctx->GetInputDim("AttnDropoutOut")); + ctx->SetOutputDim(framework::GradVarName("SrcMaskOut"), + ctx->GetInputDim("SrcMaskOut")); + ctx->SetOutputDim(framework::GradVarName("QKVOut"), + ctx->GetInputDim("QKVOut")); + ctx->SetOutputDim(framework::GradVarName("QKVBiasOut"), + ctx->GetInputDim("QKVBiasOut")); + ctx->SetOutputDim(framework::GradVarName("OutLinearOut"), + ctx->GetInputDim("OutLinearOut")); + ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"), + ctx->GetInputDim("BiasDropoutResidualOut")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = input->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_attention_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + // inputs x, parameters and their grad. 
+ op->SetInput("X", this->Input("X")); + op->SetInput("QKVW", this->Input("QKVW")); + op->SetInput("QKVBias", this->Input("QKVBias")); + op->SetInput("SrcMask", this->Input("SrcMask")); + op->SetInput("OutLinearW", this->Input("OutLinearW")); + op->SetInput("OutLinearBias", this->Input("OutLinearBias")); + if (this->HasInput("LnScale")) { + op->SetInput("LnScale", this->Input("LnScale")); + op->SetOutput(framework::GradVarName("LnScale"), + this->InputGrad("LnScale")); + } + if (this->HasInput("LnBias")) { + op->SetInput("LnBias", this->Input("LnBias")); + op->SetOutput(framework::GradVarName("LnBias"), + this->InputGrad("LnBias")); + } + if (this->HasInput("Ln2Scale")) { + op->SetInput("Ln2Scale", this->Input("Ln2Scale")); + op->SetOutput(framework::GradVarName("Ln2Scale"), + this->InputGrad("Ln2Scale")); + } + if (this->HasInput("Ln2Bias")) { + op->SetInput("Ln2Bias", this->Input("Ln2Bias")); + op->SetOutput(framework::GradVarName("Ln2Bias"), + this->InputGrad("Ln2Bias")); + } + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("QKVW"), this->InputGrad("QKVW")); + op->SetOutput(framework::GradVarName("QKVBias"), + this->InputGrad("QKVBias")); + op->SetOutput(framework::GradVarName("OutLinearBias"), + this->InputGrad("OutLinearBias")); + op->SetOutput(framework::GradVarName("OutLinearW"), + this->InputGrad("OutLinearW")); + + // use forward outputs as backward inputs. + op->SetInput("LnOut", this->Output("LnOut")); + op->SetInput("LnMean", this->Output("LnMean")); + op->SetInput("LnVariance", this->Output("LnVariance")); + op->SetInput("QKVOut", this->Output("QKVOut")); + op->SetInput("QKVBiasOut", this->Output("QKVBiasOut")); + op->SetInput("TransposeOut2", this->Output("TransposeOut2")); + op->SetInput("QKOut", this->Output("QKOut")); + op->SetInput("QKTVOut", this->Output("QKTVOut")); + op->SetInput("SoftmaxOut", this->Output("SoftmaxOut")); + op->SetInput("AttnDropoutMaskOut", this->Output("AttnDropoutMaskOut")); + op->SetInput("AttnDropoutOut", this->Output("AttnDropoutOut")); + op->SetInput("SrcMaskOut", this->Output("SrcMaskOut")); + op->SetInput("FMHAOut", this->Output("FMHAOut")); + op->SetInput("OutLinearOut", this->Output("OutLinearOut")); + + op->SetInput("Ln2Mean", this->Output("Ln2Mean")); + op->SetInput("Ln2Variance", this->Output("Ln2Variance")); + op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut")); + op->SetInput("BiasDropoutResidualOut", + this->Output("BiasDropoutResidualOut")); + op->SetInput("QKVOut", this->Output("QKVOut")); + + // backward outputs: dinput + op->SetOutput(framework::GradVarName("LnOut"), this->OutputGrad("LnOut")); + op->SetOutput(framework::GradVarName("QKVOut"), this->OutputGrad("QKVOut")); + op->SetOutput(framework::GradVarName("QKVBiasOut"), + this->OutputGrad("QKVBiasOut")); + op->SetOutput(framework::GradVarName("QKTVOut"), + this->OutputGrad("QKTVOut")); + op->SetOutput(framework::GradVarName("TransposeOut2"), + this->OutputGrad("TransposeOut2")); + op->SetOutput(framework::GradVarName("QKOut"), this->OutputGrad("QKOut")); + op->SetOutput(framework::GradVarName("SoftmaxOut"), + this->OutputGrad("SoftmaxOut")); + op->SetOutput(framework::GradVarName("AttnDropoutOut"), + this->OutputGrad("AttnDropoutOut")); + op->SetOutput(framework::GradVarName("SrcMaskOut"), + this->OutputGrad("SrcMaskOut")); + op->SetOutput(framework::GradVarName("FMHAOut"), + this->OutputGrad("FMHAOut")); + op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"), + 
this->OutputGrad("BiasDropoutResidualOut")); + op->SetOutput(framework::GradVarName("OutLinearOut"), + this->OutputGrad("OutLinearOut")); + + op->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(fused_attention, ops::FusedAttentionOp, - ops::FusedAttentionOpMaker); + ops::FusedAttentionOpMaker, + ops::FusedAttentionGradOpMaker, + ops::FusedAttentionGradOpMaker); +REGISTER_OPERATOR(fused_attention_grad, ops::FusedAttentionGradOp); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 18a42b5c2cee2..95e690cb17ec1 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -199,6 +199,237 @@ class FusedAttentionOpKernel : public framework::OpKernel { } }; +template +class FusedAttentionGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + const float ln2epsilon = ctx.Attr("ln_epsilon"); + + float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); + bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + auto &dropout_implementation_1 = + ctx.Attr("attn_dropout_implementation"); + bool is_upscale_in_train_1 = + (dropout_implementation_1 == "upscale_in_train"); + auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); + int seed_val_1 = ctx.Attr("attn_dropout_seed"); + + // get inputs. + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *d_y_data = d_y->data(); + + // fw input + auto *input_x = ctx.Input("X"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_2_scale = ctx.Input("Ln2Scale"); + auto *x_data = input_x->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_2_scale_data = + (ln_2_scale == nullptr ? nullptr : ln_2_scale->data()); + // fw parameters. + auto *src_mask = ctx.Input("SrcMask"); + auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_bias = ctx.Input("QKVBias"); + auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *src_mask_data = (src_mask == nullptr ? 
nullptr : src_mask->data()); + auto *qkv_weight_data = qkv_weight->data(); + auto *qkv_bias_data = qkv_bias->data(); + auto *out_linear_weight_data = out_linear_weight->data(); + auto *out_linear_bias_data = out_linear_bias->data(); + + // fw output + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *ln_out = ctx.Input("LnOut"); + auto *fmha_out = ctx.Input("FMHAOut"); + auto *transpose_out_2 = ctx.Input("TransposeOut2"); + auto *qk_out = ctx.Input("QKOut"); + auto *qktv_out = ctx.Input("QKTVOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *attn_dropout_mask_out = ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *src_mask_out = ctx.Input("SrcMaskOut"); + auto *out_linear_out = ctx.Input("OutLinearOut"); + auto *ln_2_mean = ctx.Input("Ln2Mean"); + auto *ln_2_var = ctx.Input("Ln2Variance"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean_data = ln_mean->data(); + auto *ln_var_data = ln_var->data(); + auto *ln_out_data = ln_out->data(); + auto *fmha_out_data = fmha_out->data(); + auto *transpose_out_2_data = transpose_out_2->data(); + auto *qk_out_data = qk_out->data(); + auto *qktv_out_data = qktv_out->data(); + auto *softmax_out_data = softmax_out->data(); + auto *src_mask_out_data = src_mask_out->data(); + auto *out_linear_out_data = out_linear_out->data(); + auto *ln_2_mean_data = ln_2_mean->data(); + auto *ln_2_var_data = ln_2_var->data(); + auto *dropout_mask_out_data = dropout_mask_out->data(); + auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); + + // output's grad + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_ln_out = ctx.Output(framework::GradVarName("LnOut")); + auto *d_qkv_out = ctx.Output(framework::GradVarName("QKVOut")); + auto *d_qkv_bias_out = + ctx.Output(framework::GradVarName("QKVBiasOut")); + auto *d_qktv_out = ctx.Output(framework::GradVarName("QKTVOut")); + auto *d_transpose_out_2 = + ctx.Output(framework::GradVarName("TransposeOut2")); + auto *d_qk_out = ctx.Output(framework::GradVarName("QKOut")); + auto *d_softmax_out = + ctx.Output(framework::GradVarName("SoftmaxOut")); + auto *d_attn_dropout_out = + ctx.Output(framework::GradVarName("AttnDropoutOut")); + auto *d_src_mask_out = + ctx.Output(framework::GradVarName("SrcMaskOut")); + auto *d_fmha_out = ctx.Output(framework::GradVarName("FMHAOut")); + auto *d_out_linear_out = + ctx.Output(framework::GradVarName("OutLinearOut")); + auto *d_bias_dropout_residual_out = + ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_ln_out_data = d_ln_out->mutable_data(ctx.GetPlace()); + auto *d_qkv_out_data = d_qkv_out->mutable_data(ctx.GetPlace()); + auto *d_qkv_bias_out_data = d_qkv_bias_out->mutable_data(ctx.GetPlace()); + auto *d_qktv_out_data = d_qktv_out->mutable_data(ctx.GetPlace()); + auto *d_transpose_out_2_data = + d_transpose_out_2->mutable_data(ctx.GetPlace()); + auto *d_qk_out_data = d_qk_out->mutable_data(ctx.GetPlace()); + auto *d_softmax_out_data = d_softmax_out->mutable_data(ctx.GetPlace()); + auto *d_attn_dropout_out_data = + d_attn_dropout_out->mutable_data(ctx.GetPlace()); + auto *d_src_mask_out_data = d_src_mask_out->mutable_data(ctx.GetPlace()); + auto *d_fmha_out_data = d_fmha_out->mutable_data(ctx.GetPlace()); + auto *d_out_linear_out_data = + d_out_linear_out->mutable_data(ctx.GetPlace()); + auto 
*d_bias_dropout_residual_out_data = + d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + + // parameter grad + auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); + auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + auto *d_out_linear_weight = + ctx.Output(framework::GradVarName("OutLinearW")); + auto *d_out_linear_bias = + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *d_ln_2_scale = ctx.Output(framework::GradVarName("Ln2Scale")); + auto *d_ln_2_bias = ctx.Output(framework::GradVarName("Ln2Bias")); + auto *d_ln_scale_data = + (d_ln_scale == nullptr ? nullptr + : d_ln_scale->mutable_data(ctx.GetPlace())); + auto *d_ln_bias_data = + (d_ln_bias == nullptr ? nullptr + : d_ln_bias->mutable_data(ctx.GetPlace())); + auto *d_qkv_weight_data = d_qkv_weight->mutable_data(ctx.GetPlace()); + auto *d_qkv_bias_data = d_qkv_bias->mutable_data(ctx.GetPlace()); + auto *d_out_linear_weight_data = + d_out_linear_weight->mutable_data(ctx.GetPlace()); + auto *d_out_linear_bias_data = + d_out_linear_bias->mutable_data(ctx.GetPlace()); + auto *d_ln_2_scale_data = + (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data( + ctx.GetPlace())); + auto *d_ln_2_bias_data = + (d_ln_2_bias == nullptr ? nullptr + : d_ln_2_bias->mutable_data(ctx.GetPlace())); + + const auto input_x_dims = input_x->dims(); + const auto qkv_w_dims = qkv_weight->dims(); + + int batch_size = input_x_dims[0]; + int max_seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + int num_head = qkv_w_dims[1]; + int dim_head = qkv_w_dims[2]; + + int bsz_seq = batch_size * max_seq_len; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + Tensor d_residual; + d_residual.Resize(input_x_dims); + T *d_residual_data = d_residual.mutable_data(ctx.GetPlace()); + + bool transA = false; + bool transB = true; + bool compute_bias = true; + auto layer_norm_compute = AttnLayerNorm(ctx.cuda_device_context(), + epsilon, bsz_seq, dim_embed); + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, + output_size, input_size, compute_bias); + AttnDropoutParam attn_dropout_param( + is_test_1, dropout_implementation_1, attn_dropout_prob, + is_upscale_in_train_1, is_fix_seed_1, seed_val_1, seed_1); + auto fmha_ref_compute = + FMHARef(ctx.cuda_device_context(), batch_size, max_seq_len, num_head, + dim_head, attn_dropout_param); + output_size = hidden_size; + transA = false; + transB = false; + compute_bias = false; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, + output_size, input_size, compute_bias); + DropoutParam dropout_param2(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, + ln2epsilon); + + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data, + dropout_mask_out_data, ln_2_scale_data, ln_2_mean_data, ln_2_var_data, + d_bias_dropout_residual_out_data, d_ln_2_scale_data, d_ln_2_bias_data, + d_out_linear_out_data, d_out_linear_bias_data, d_residual_data); + + out_linear_compute.ComputeBackward(fmha_out_data, out_linear_weight_data, + d_out_linear_out_data, d_fmha_out_data, + d_out_linear_weight_data, nullptr); + fmha_ref_compute.ComputeBackward( + *transpose_out_2, 
*src_mask, *softmax_out, *attn_dropout_mask_out, + *attn_dropout_out, *qk_out, *src_mask_out, *d_fmha_out, d_qktv_out, + d_attn_dropout_out, d_softmax_out, d_src_mask_out, d_qk_out, + d_transpose_out_2, nullptr, d_qkv_bias_out); + cudaMemcpyAsync(d_qkv_out_data, d_qkv_bias_out_data, + bsz_seq * 3 * num_head * dim_head * sizeof(T), + cudaMemcpyDeviceToDevice); + + if (pre_layer_norm) { + qkv_compute.ComputeBackward(ln_out_data, qkv_weight_data, + d_qkv_bias_out_data, d_ln_out_data, + d_qkv_weight_data, d_qkv_bias_data); + layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, + ln_mean_data, ln_var_data, d_x_data, + d_ln_scale_data, d_ln_bias_data); + } else { + qkv_compute.ComputeBackward(x_data, qkv_weight_data, d_qkv_bias_out_data, + d_x_data, d_qkv_weight_data, d_qkv_bias_data); + } + // gradient accumulation + std::vector ins; + std::vector outs; + ins.emplace_back(&d_residual); + ins.emplace_back(d_x); + outs.emplace_back(d_x); + int elewise_add_axis = -1; + LaunchElementwiseCudaKernel( + ctx.cuda_device_context(), ins, &outs, elewise_add_axis, + AddFunctor()); + } +}; + } // namespace operators } // namespace paddle @@ -207,3 +438,7 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(fused_attention, ops::FusedAttentionOpKernel, ops::FusedAttentionOpKernel, ops::FusedAttentionOpKernel); +REGISTER_OP_CUDA_KERNEL(fused_attention_grad, + ops::FusedAttentionGradKernel, + ops::FusedAttentionGradKernel, + ops::FusedAttentionGradKernel); diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 3fb58eab077bc..049c37f1ea0c4 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -110,27 +110,34 @@ inline __device__ void CalculateDBias(const T *tmp_sum, T *dbias, } __syncthreads(); // reduce sum - T sum = static_cast(0); + T sum[2] = {static_cast(0)}; int tid = threadIdx.y * blockDim.x + threadIdx.x; int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 // need BlockSizeX * VecSize warps - if (x < BlockSizeX * VecSize) { + for (int j = x; j < BlockSizeX * VecSize; j += 32) { // reduce 128 to 32 #pragma unroll for (int i = 0; i < (BlockSizeY >> 5); i++) { - sum += cache[x][y + i * 32]; + sum[(j >> 5)] += cache[j][y + i * 32]; } } + int reduce_num_pre_thread = (BlockSizeX * VecSize + 31) / 32; // reduce 32 to 1 - sum = WarpReduceSum(sum); + for (int i = 0; i < reduce_num_pre_thread; i++) { + sum[i] = WarpReduceSum(sum[i]); + } // save sum to dbias - int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) { - dbias[bias_id] = sum; + if (y == 0 && x < BlockSizeX * VecSize) { + for (int i = 0; i < reduce_num_pre_thread; i++) { + int bias_id = blockIdx.x * BlockSizeX * VecSize + x + i * 32; + if (bias_id < cols) { + dbias[bias_id] = sum[i]; + } + } } } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc new file mode 100644 index 0000000000000..4e03c7369d10e --- /dev/null +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -0,0 +1,359 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/matmul_v2_op.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedFeedForwardOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "fused_feedforward"); + OP_INOUT_CHECK(context->HasInput("Linear1Weight"), "Input", "Linear1Weight", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasInput("Linear2Weight"), "Input", "Linear2Weight", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout1Mask"), "Output", "Dropout1Mask", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout2Mask"), "Output", "Dropout2Mask", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Mean"), "Output", "Ln1Mean", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Variance"), "Output", "Ln1Variance", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln2Mean"), "Output", "Ln2Mean", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln2Variance"), "Output", "Ln2Variance", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Linear1Out"), "Output", "Linear1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Out"), "Output", "Ln1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout1Out"), "Output", "Dropout1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout2Out"), "Output", "Dropout2Out", + "fused_feedforward"); + + auto dim_x = context->GetInputDim("X"); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, false); + // verify for the pre layer_norm, the feature size must be larger than 1 + PADDLE_ENFORCE_GT( + mat_dim_x.width_, static_cast(1), + platform::errors::InvalidArgument("Product from the X shape[1] to " + "shape[n-1] must be larger than 1!")); + auto dim_Linear1Weight = context->GetInputDim("Linear1Weight"); + auto tmp_dim_x = dim_x; + tmp_dim_x[dim_x.size() - 1] = + dim_Linear1Weight[dim_Linear1Weight.size() - 1]; + context->SetOutputDim("Out", dim_x); + if (context->Attrs().Get("dropout1_is_test") == false) { + context->SetOutputDim("Dropout1Mask", tmp_dim_x); + } + context->SetOutputDim("Dropout1Out", tmp_dim_x); + context->SetOutputDim("Linear1Out", tmp_dim_x); + context->SetOutputDim("Ln1Out", dim_x); + context->SetOutputDim("Dropout2Out", dim_x); + + if (context->Attrs().Get("dropout2_is_test") == false) { + context->SetOutputDim("Dropout2Mask", dim_x); + } + framework::DDim mean_dim = + framework::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_}); + context->SetOutputDim("Ln1Mean", mean_dim); + context->SetOutputDim("Ln1Variance", mean_dim); + context->SetOutputDim("Ln2Mean", mean_dim); + 
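+    // NOTE: layer_norm statistics are kept per row of the flattened
+    // [batch_size * seq_len, d_model] view of X (mat_dim_x.batch_size_ *
+    // mat_dim_x.height_ rows), so each of the four mean/variance outputs
+    // holds one value per token rather than per feature.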
context->SetOutputDim("Ln2Variance", mean_dim); + context->ShareLoD("X", "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of FusedFeedForward op"); + AddInput( + "Dropout1Seed", + "The seed of first dropout op, it has higher priority than the attr " + "fix_seed and seed") + .AsDispensable(); + AddInput( + "Dropout2Seed", + "The seed of second dropout op, it has higher priority than the attr " + "fix_seed and seed") + .AsDispensable(); + + AddInput("Linear1Weight", "The linear1 weight of FusedFeedForward op"); + AddInput("Linear1Bias", "The linear1 bias of FusedFeedForward op") + .AsDispensable(); + AddInput("Linear2Weight", "The linear2 weight of FusedFeedForward op"); + AddInput("Linear2Bias", "The linear2 bias input of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln1Scale", "The layer_norm1 scale of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln1Bias", "The layer_norm1 bias of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln2Scale", "The layer_norm2 scale of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln2Bias", "The layer_norm2 bias of FusedFeedForward op") + .AsDispensable(); + AddOutput("Out", "The output of FusedFeedForward op"); + AddOutput("Dropout1Mask", "The mask of dropout1").AsIntermediate(); + AddOutput("Dropout2Mask", "The mask of dropout2").AsIntermediate(); + AddOutput("Ln1Mean", "The mean of layer_norm1").AsIntermediate(); + AddOutput("Ln1Variance", "The variance of layer_norm1").AsIntermediate(); + AddOutput("Ln2Mean", "The mean of layer_nomr2").AsIntermediate(); + AddOutput("Ln2Variance", "The variance of layer_norm2").AsIntermediate(); + AddOutput("Linear1Out", "The output of linear1").AsIntermediate(); + AddOutput("Ln1Out", "The output of layer_norm1").AsIntermediate(); + AddOutput("Dropout1Out", "The output of dropout1").AsIntermediate(); + AddOutput("Dropout2Out", "The output of dropout2").AsIntermediate(); + + AddAttr("pre_layer_norm", "true is pre layernorm").SetDefault(false); + AddAttr("ln1_epsilon", "epsilon of pre layer_norm") + .SetDefault(1e-5f); + AddAttr("ln2_epsilon", "epsilon of post layer_norm") + .SetDefault(1e-5f); + AddAttr("act_method", "act_method").SetDefault("gelu"); + AddAttr("dropout1_rate", "the dropout rate of first dropout") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ( + drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout1_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("dropout2_rate", "the dropout rate of second dropout") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ( + drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout2_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("dropout1_implementation", + "the dropout implementation of first dropout") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout1_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("dropout2_implementation", + "the dropout implementation of second 
dropout") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout2_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("dropout1_is_test", "the is_test of first dropout") + .SetDefault(false); + AddAttr("dropout2_is_test", "the is_test of second dropout") + .SetDefault(false); + AddAttr("dropout1_fix_seed", "the is_test of first dropout") + .SetDefault(false); + AddAttr("dropout2_fix_seed", "the is_test of second dropout") + .SetDefault(false); + AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); + AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddComment(R"DOC( + the function of fused_feedforward operator is the same as the following pseudo code: + residual = src; + ln1_out = src; + if(pre_layer_norm){ + ln1_out = layer_norm(src); + } + out = linear(dropout(activation(dropout(linear(ln1_out))))); + if(!pre_layer_norm) { + out = layer_norm(out); + } + )DOC"); + } +}; + +class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout1_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout2_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + OP_INOUT_CHECK(ctx->HasInput("Dropout1Mask"), "Input", "Dropout1Mask", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout2Mask"), "Input", "Dropout1Mask", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear1Out"), "Input", "Linear1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Out"), "Input", "Ln1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout1Out"), "Input", "Dropout1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout2Out"), "Input", "Dropout2Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear1Weight"), "Input", "Linear1Weight", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear2Weight"), "Input", "Linear2Weight", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Mean"), "Input", "Ln1Mean", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Variance"), "Input", "Ln1Variance", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance", + "FusedFeedForwardGrad"); + + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "FusedFeedForwardGrad"); + + auto d_out_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), d_out_dim); + if (ctx->HasOutput(framework::GradVarName("Ln1Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln1Scale"), + ctx->GetInputDim("Ln1Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Ln1Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln1Bias"), + ctx->GetInputDim("Ln1Bias")); + } + if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Scale"), + ctx->GetInputDim("Ln2Scale")); + } + if 
(ctx->HasOutput(framework::GradVarName("Ln2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Bias"), + ctx->GetInputDim("Ln2Bias")); + } + ctx->SetOutputDim(framework::GradVarName("Linear1Weight"), + ctx->GetInputDim("Linear1Weight")); + if (ctx->HasOutput(framework::GradVarName("Linear1Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Linear1Bias"), + ctx->GetInputDim("Linear1Bias")); + } + ctx->SetOutputDim(framework::GradVarName("Linear2Weight"), + ctx->GetInputDim("Linear2Weight")); + if (ctx->HasOutput(framework::GradVarName("Linear2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Linear2Bias"), + ctx->GetInputDim("Linear2Bias")); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = input->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedFeedForwardOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_feedforward_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Linear1Weight", this->Input("Linear1Weight")); + op->SetInput("Linear1Bias", this->Input("Linear1Bias")); + op->SetInput("Linear2Weight", this->Input("Linear2Weight")); + op->SetInput("Ln1Scale", this->Input("Ln1Scale")); + op->SetInput("Ln1Bias", this->Input("Ln1Bias")); + op->SetInput("Ln2Scale", this->Input("Ln2Scale")); + op->SetInput("Ln2Bias", this->Input("Ln2Bias")); + op->SetInput("Dropout1Mask", this->Output("Dropout1Mask")); + op->SetInput("Dropout2Mask", this->Output("Dropout2Mask")); + op->SetInput("Linear1Out", this->Output("Linear1Out")); + op->SetInput("Ln1Out", this->Output("Ln1Out")); + op->SetInput("Ln1Mean", this->Output("Ln1Mean")); + op->SetInput("Ln1Variance", this->Output("Ln1Variance")); + op->SetInput("Ln2Mean", this->Output("Ln2Mean")); + op->SetInput("Ln2Variance", this->Output("Ln2Variance")); + op->SetInput("Dropout1Out", this->Output("Dropout1Out")); + op->SetInput("Dropout2Out", this->Output("Dropout2Out")); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Ln1Scale"), + this->InputGrad("Ln1Scale")); + op->SetOutput(framework::GradVarName("Ln1Bias"), + this->InputGrad("Ln1Bias")); + op->SetOutput(framework::GradVarName("Ln2Scale"), + this->InputGrad("Ln2Scale")); + op->SetOutput(framework::GradVarName("Ln2Bias"), + this->InputGrad("Ln2Bias")); + op->SetOutput(framework::GradVarName("Linear1Weight"), + this->InputGrad("Linear1Weight")); + op->SetOutput(framework::GradVarName("Linear1Bias"), + this->InputGrad("Linear1Bias")); + op->SetOutput(framework::GradVarName("Linear2Weight"), + this->InputGrad("Linear2Weight")); + if (this->HasInput("Linear2Bias")) { + op->SetInput("Linear2Bias", this->Input("Linear2Bias")); + op->SetOutput(framework::GradVarName("Linear2Bias"), + this->InputGrad("Linear2Bias")); + } + + op->SetAttrMap(this->Attrs()); + } +}; + +template +class FusedFeedForwardOpDoubleGradMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override {} +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_feedforward, 
ops::FusedFeedForwardOp, + ops::FusedFeedForwardOpMaker, + ops::FusedFeedForwardOpGradMaker, + ops::FusedFeedForwardOpGradMaker); +REGISTER_OPERATOR(fused_feedforward_grad, ops::FusedFeedForwardOpGrad); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu new file mode 100644 index 0000000000000..61a8a9a82f2e0 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -0,0 +1,394 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/matmul_v2_op.h" + +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedFeedForwardKernel : public framework::OpKernel { + public: + void MatMul(const platform::CUDADeviceContext& ctx, + const framework::Tensor& a, const framework::Tensor& b, + framework::Tensor* c) const { + auto blas = math::GetBlas(ctx); + auto a_2d = FoldInitDims(a); + auto b_2d = FoldInitDims(b); + auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); + } + + void FFN(const framework::Tensor& x, const framework::Tensor& linear1_weight, + const framework::Tensor* linear1_bias, + const framework::Tensor& linear2_weight, + const framework::Tensor* linear2_bias, + const framework::Tensor* ln1_scale, + const framework::Tensor* ln1_bias, + const framework::Tensor* ln2_scale, + const framework::Tensor* ln2_bias, framework::Tensor* out, + framework::Tensor* dropout1_mask, framework::Tensor* dropout2_mask, + framework::Tensor* ln1_mean, framework::Tensor* ln1_variance, + framework::Tensor* ln2_mean, framework::Tensor* ln2_variance, + framework::Tensor* linear1_out, framework::Tensor* ln1_out, + framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, + const int bsz_seq, const int d_model, const int dim_feedforward, + const std::string& act_method, const bool pre_layer_norm, + const float epsilon1, const float epsilon2, + const DropoutParam& dropout_param1, + const DropoutParam& dropout_param2, + const platform::CUDADeviceContext& ctx) const { + FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + FusedDropoutHelper fused_act_dropout_helper( + ctx, bsz_seq, dim_feedforward, dropout_param1); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + auto place = ctx.GetPlace(); + using U = LayerNormParamType; + const framework::Tensor* in = &x; + + const U* ln1_scale_ptr = + ln1_scale == nullptr ? 
nullptr : ln1_scale->data(); + const U* ln1_bias_ptr = ln1_bias == nullptr ? nullptr : ln1_bias->data(); + const U* ln2_scale_ptr = + ln2_scale == nullptr ? nullptr : ln2_scale->data(); + const U* ln2_bias_ptr = ln2_bias == nullptr ? nullptr : ln2_bias->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + const T* linear2_bias_ptr = + linear2_bias == nullptr ? nullptr : linear2_bias->data(); + + if (pre_layer_norm) { + pre_layernorm_helper.LayerNorm( + ctx, x.data(), ln1_scale_ptr, ln1_bias_ptr, ln1_out->data(), + ln1_mean->data(), ln1_variance->data()); + in = ln1_out; + } + MatMul(ctx, *in, linear1_weight, linear1_out); + fused_act_dropout_helper.DropoutActBias( + ctx, linear1_out->data(), linear1_bias_ptr, act_method, + dropout1_out->data(), dropout1_mask->data()); + framework::Tensor linear2_out; + linear2_out.mutable_data({bsz_seq, d_model}, place); + MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + if (!pre_layer_norm) { + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx, linear2_out.data(), x.data(), linear2_bias_ptr, + ln2_scale_ptr, ln2_bias_ptr, dropout2_out->data(), + dropout2_mask->data(), out->data(), ln2_mean->data(), + ln2_variance->data()); + } else { + fused_dropout_layernorm_helper.ResidualDropoutBias( + ctx, linear2_out.data(), x.data(), linear2_bias_ptr, + out->data(), dropout2_mask->data()); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = context.Input("Linear2Bias"); + auto* ln1_scale = context.Input("Ln1Scale"); + auto* ln1_bias = context.Input("Ln1Bias"); + auto* ln2_scale = context.Input("Ln2Scale"); + auto* ln2_bias = context.Input("Ln2Bias"); + + auto* ln1_mean = context.Output("Ln1Mean"); + auto* ln1_variance = context.Output("Ln1Variance"); + auto* ln2_mean = context.Output("Ln2Mean"); + auto* ln2_variance = context.Output("Ln2Variance"); + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = context.Output("Linear1Out"); + auto* ln1_out = context.Output("Ln1Out"); + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); + + const std::string act_method = context.Attr("act_method"); + + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + const float epsilon1 = context.Attr("ln1_epsilon"); + const float epsilon2 = context.Attr("ln2_epsilon"); + + DropoutParam dropout_param1(context, 1); + DropoutParam dropout_param2(context, 2); + + using U = LayerNormParamType; + auto place = context.GetPlace(); + out->mutable_data(place); + dropout1_mask->mutable_data(place); + dropout2_mask->mutable_data(place); + ln1_mean->mutable_data(place); + ln1_variance->mutable_data(place); + ln2_mean->mutable_data(place); + ln2_variance->mutable_data(place); + linear1_out->mutable_data(place); + ln1_out->mutable_data(place); + dropout1_out->mutable_data(place); + dropout2_out->mutable_data(place); + + auto x_dim = x->dims(); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); + + auto dim = linear1_weight->dims(); + int d_model = dim[0]; + int dim_feedforward = dim[dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + 
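+    // A worked shape example (hypothetical sizes, not taken from this PR):
+    // for X = [2, 128, 768] and Linear1Weight = [768, 3072], the call below
+    // runs with bsz_seq = 2 * 128 = 256, d_model = 768 and
+    // dim_feedforward = 3072, i.e. linear1 is a [256, 768] x [768, 3072]
+    // GEMM and linear2 maps [256, 3072] back to [256, 768].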
FFN(*x, *linear1_weight, linear1_bias, *linear2_weight, linear2_bias, + ln1_scale, ln1_bias, ln2_scale, ln2_bias, out, dropout1_mask, + dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, + linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, + dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, + dropout_param1, dropout_param2, context.cuda_device_context()); + } +}; + +template +class FusedFeedForwardGradKernel : public framework::OpKernel { + public: + void MatMulGrad(const platform::CUDADeviceContext& ctx, + const framework::Tensor& d_out, const framework::Tensor& a, + const framework::Tensor& b, framework::Tensor* d_a, + framework::Tensor* d_b) const { + auto blas = math::GetBlas(ctx); + auto a_2d = FoldInitDims(a); + auto b_2d = FoldInitDims(b); + auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, true); + auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, true); + auto mat_dim_dout = math::CreateMatrixDescriptor(d_out.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); + blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); + } + + void FFNGrad( + const framework::Tensor& d_out, const framework::Tensor& x, + const framework::Tensor& dropout1_mask, + const framework::Tensor& dropout2_mask, + const framework::Tensor& linear1_out, const framework::Tensor& ln1_out, + const framework::Tensor& dropout1_out, + const framework::Tensor& dropout2_out, + const framework::Tensor& linear1_weight, + const framework::Tensor* linear1_bias, + const framework::Tensor& linear2_weight, + const framework::Tensor* ln1_gamma, const framework::Tensor* ln1_beta, + const framework::Tensor& ln1_mean, const framework::Tensor& ln1_variance, + const framework::Tensor* ln2_gamma, const framework::Tensor* ln2_beta, + const framework::Tensor& ln2_mean, const framework::Tensor& ln2_variance, + framework::Tensor* d_x, framework::Tensor* d_linear1_weight, + framework::Tensor* d_linear1_bias, framework::Tensor* d_linear2_weight, + framework::Tensor* d_linear2_bias, framework::Tensor* d_ln1_gamma, + framework::Tensor* d_ln1_beta, framework::Tensor* d_ln2_gamma, + framework::Tensor* d_ln2_beta, const int bsz_seq, const int d_model, + const int dim_feedforward, const DropoutParam& dropout_param1, + const DropoutParam& dropout_param2, const std::string& act_method, + const bool pre_layer_norm, const float epsilon1, const float epsilon2, + const platform::CUDADeviceContext& ctx) const { + FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + FusedDropoutHelper fused_act_dropout_helper( + ctx, bsz_seq, dim_feedforward, dropout_param1); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + auto place = ctx.GetPlace(); + using U = LayerNormParamType; + const U* ln1_gamma_ptr = + ln1_gamma == nullptr ? nullptr : ln1_gamma->data(); + const U* ln1_beta_ptr = ln1_beta == nullptr ? nullptr : ln1_beta->data(); + const U* ln2_gamma_ptr = + ln2_gamma == nullptr ? nullptr : ln2_gamma->data(); + const U* ln2_beta_ptr = ln2_beta == nullptr ? nullptr : ln2_beta->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + T* d_linear1_bias_ptr = + d_linear1_bias == nullptr ? nullptr : d_linear1_bias->data(); + T* d_linear2_bias_ptr = + d_linear2_bias == nullptr ? nullptr : d_linear2_bias->data(); + U* d_ln1_gamma_ptr = + d_ln1_gamma == nullptr ? 
nullptr : d_ln1_gamma->data(); + U* d_ln1_beta_ptr = d_ln1_beta == nullptr ? nullptr : d_ln1_beta->data(); + U* d_ln2_gamma_ptr = + d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data(); + U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data(); + + framework::Tensor d_linear2_out, d_dropout2_out, d_residual; + d_linear2_out.mutable_data({bsz_seq, d_model}, place); + d_dropout2_out.mutable_data({bsz_seq, d_model}, place); + d_residual.mutable_data({bsz_seq, d_model}, place); + + if (pre_layer_norm) { + fused_dropout_layernorm_helper.ResidualDropoutBiasGrad( + ctx, d_out.data(), dropout2_mask.data(), + d_linear2_out.data(), d_residual.data(), d_linear2_bias_ptr); + } else { + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx, d_out.data(), dropout2_out.data(), + dropout2_mask.data(), ln2_gamma_ptr, ln2_mean.data(), + ln2_variance.data(), d_dropout2_out.data(), d_ln2_gamma_ptr, + d_ln2_beta_ptr, d_linear2_out.data(), d_linear2_bias_ptr, + d_residual.data()); + } + + framework::Tensor d_dropout1_out; + d_dropout1_out.mutable_data({bsz_seq, dim_feedforward}, place); + MatMulGrad(ctx, d_linear2_out, dropout1_out, linear2_weight, + &d_dropout1_out, d_linear2_weight); + + framework::Tensor d_linear1_out; + d_linear1_out.mutable_data({bsz_seq, dim_feedforward}, place); + fused_act_dropout_helper.DropoutActBiasGrad( + ctx, d_dropout1_out.data(), linear1_out.data(), linear1_bias_ptr, + dropout1_mask.data(), d_linear1_out.data(), + d_linear1_bias_ptr, act_method); + + if (pre_layer_norm) { + framework::Tensor d_ln1_out; + d_ln1_out.mutable_data({bsz_seq, d_model}, place); + MatMulGrad(ctx, d_linear1_out, ln1_out, linear1_weight, &d_ln1_out, + d_linear1_weight); + + pre_layernorm_helper.LayerNormGrad(ctx, d_ln1_out.data(), x.data(), + ln1_gamma_ptr, ln1_mean.data(), + ln1_variance.data(), d_x->data(), + d_ln1_gamma_ptr, d_ln1_beta_ptr); + } else { + MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + using U = LayerNormParamType; + auto d_out = + *context.Input(framework::GradVarName("Out")); + auto x = *context.Input("X"); + auto dropout1_mask = *context.Input("Dropout1Mask"); + auto dropout2_mask = *context.Input("Dropout2Mask"); + auto linear1_out = *context.Input("Linear1Out"); + auto ln1_out = *context.Input("Ln1Out"); + auto dropout1_out = *context.Input("Dropout1Out"); + auto dropout2_out = *context.Input("Dropout2Out"); + auto linear1_weight = *context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto linear2_weight = *context.Input("Linear2Weight"); + auto ln1_mean = *context.Input("Ln1Mean"); + auto ln1_variance = *context.Input("Ln1Variance"); + auto* ln1_scale = context.Input("Ln1Scale"); + auto* ln1_bias = context.Input("Ln1Bias"); + auto ln2_mean = *context.Input("Ln2Mean"); + auto ln2_variance = *context.Input("Ln2Variance"); + auto* ln2_scale = context.Input("Ln2Scale"); + auto* ln2_bias = context.Input("Ln2Bias"); + + auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_ln1_scale = + context.Output(framework::GradVarName("Ln1Scale")); + auto* d_ln1_bias = + context.Output(framework::GradVarName("Ln1Bias")); + auto* d_ln2_scale = + context.Output(framework::GradVarName("Ln2Scale")); + auto* d_ln2_bias = + context.Output(framework::GradVarName("Ln2Bias")); + auto* d_linear1_weight = context.Output( + framework::GradVarName("Linear1Weight")); + auto* d_linear1_bias = context.Output( + 
framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = context.Output( + framework::GradVarName("Linear2Weight")); + auto* d_linear2_bias = context.Output( + framework::GradVarName("Linear2Bias")); + + const float epsilon1 = context.Attr("ln1_epsilon"); + const float epsilon2 = context.Attr("ln2_epsilon"); + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + const std::string act_method = context.Attr("act_method"); + DropoutParam dropout_param1(context, 1); + DropoutParam dropout_param2(context, 2); + + auto place = context.GetPlace(); + d_x->mutable_data(place); + if (d_ln1_scale) { + d_ln1_scale->mutable_data(place); + } + if (d_ln1_bias) { + d_ln1_bias->mutable_data(place); + } + if (d_ln2_scale) { + d_ln2_scale->mutable_data(place); + } + if (d_ln2_bias) { + d_ln2_bias->mutable_data(place); + } + if (d_linear1_bias) { + d_linear1_bias->mutable_data(place); + } + if (d_linear2_bias) { + d_linear2_bias->mutable_data(place); + } + d_linear1_weight->mutable_data(place); + d_linear2_weight->mutable_data(place); + + auto x_dim = x.dims(); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); + + auto linear1_weight_dim = linear1_weight.dims(); + int d_model = linear1_weight_dim[0]; + int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + FFNGrad(d_out, x, dropout1_mask, dropout2_mask, linear1_out, ln1_out, + dropout1_out, dropout2_out, linear1_weight, linear1_bias, + linear2_weight, ln1_scale, ln1_bias, ln1_mean, ln1_variance, + ln2_scale, ln2_bias, ln2_mean, ln2_variance, d_x, d_linear1_weight, + d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, + d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, + dim_feedforward, dropout_param1, dropout_param2, act_method, + pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_feedforward, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel); +REGISTER_OP_CUDA_KERNEL( + fused_feedforward_grad, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index b1857b49eede0..da386052c7dc0 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -82,6 +82,9 @@ static inline void clip(const platform::CPUDeviceContext& ctx, auto grid_abs = grid_slice_t.abs(); auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } } else { auto double_range = static_cast((max_val + 1) * 2); auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); @@ -128,6 +131,9 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx, grid_scale_t * ((is_neg == one_more_flip).template cast() - (is_neg != one_more_flip).template cast()); grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } } else { auto double_range = static_cast((max_val + 1) * 2); auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc 
b/paddle/fluid/operators/huber_loss_op_npu.cc index a942615594154..33cbaec4dfc46 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/huber_loss_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index d893fbd019628..b30c7ac810c01 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc index 9155afecd021b..01579abd74d23 100644 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ b/paddle/fluid/operators/is_empty_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/is_empty_op.h" diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index a8d906d4b5cad..74b44165dcc4c 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/log_loss_op.h" #include diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index b75ae8a65881a..3cb91c712335d 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -21,6 +21,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +constexpr int64_t kNoPadding = -1; + template class LookupTableV2NPUKernel : public framework::OpKernel { public: @@ -35,16 +38,52 @@ class LookupTableV2NPUKernel : public framework::OpKernel { platform::errors::InvalidArgument("npu only accept LoDTensor")); output_t->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) + int64_t padding_idx = ctx.Attr("padding_idx"); + if (padding_idx == kNoPadding) { + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(*table_t) + .AddInput(*ids_t) + .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif + .AddOutput(*output_t); + runner.Run(); + } else { + Tensor tmp_table_t(table_t->type()); + tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); + + Tensor index; + index.mutable_data({1, 1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&index, + static_cast(padding_idx)); + + auto updata_dim = framework::make_ddim({1, table_t->dims()[1]}); + Tensor update; + update.mutable_data(updata_dim, ctx.GetPlace()); + FillNpuTensorWithConstant(&update, static_cast(0)); + update.Resize(updata_dim); + + NpuOpRunner update_runner; + update_runner.SetType("TensorScatterUpdate") + .AddInput(*table_t) + .AddInput(index) + .AddInput(update) + .AddOutput(tmp_table_t); + update_runner.Run(); + + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(tmp_table_t) + .AddInput(*ids_t) + .AddInput(std::vector{0}) #if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) + .AddAttrs({{"batch_dims", 0}}) #endif - .AddOutput(*output_t); - runner.Run(); + .AddOutput(*output_t); + runner.Run(); + } } }; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 48b0d2ab46057..84a970a9a2606 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -979,6 +979,49 @@ __global__ void KernelMaxPool3DGrad( } } +template +void Pool3dDirectCUDAFunctor::operator()( + const T* input, const std::vector& input_shape, + const std::vector& output_shape, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + bool exclusive, bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute) { + const int batch_size = input_shape[0]; + const int input_channels = input_shape[1]; + const int input_depth = input_shape[2]; + const int input_height = input_shape[3]; + const int input_width = input_shape[4]; + const int output_channels = output_shape[1]; + const int output_depth = output_shape[2]; + const int output_height = output_shape[3]; + const int output_width = output_shape[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int thread_num = 1024; +#ifdef WITH_NV_JETSON + thread_num = 512; +#endif + int blocks = (nthreads + thread_num - 1) / thread_num; + dim3 threads(thread_num, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input, input_channels, input_depth, input_height, input_width, + output_depth, 
output_height, output_width, ksize_depth, ksize_height, + ksize_width, stride_depth, stride_height, stride_width, padding_depth, + padding_height, padding_width, pool_compute, exclusive, adaptive, output); +} + /* * Tensors are in NCDHW or NDHWC format. * Ksize, strides, paddings are three elements. These three elements represent @@ -1315,6 +1358,11 @@ class MaxPool3dGradFunctor { } }; +template class Pool3dDirectCUDAFunctor, + float>; +template class Pool3dDirectCUDAFunctor, + float>; + template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool exclusive, + bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + template class Pool3dFunctor { public: diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc index d5606177a5592..df811abc1de98 100644 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/npu_op_runner.h" @@ -21,40 +19,253 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void Mul(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); + runner_dx.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); + runner_dx.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void Dot(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); + 
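+    // Fast path above: when alpha == 1 (compared within float epsilon), a
+    // single NPU "MatMul" suffices. The else-branch materializes the raw
+    // product in a temporary tensor and rescales it with "Muls", so alpha
+    // is applied in a second pass.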
} else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("MatMul", {X, Y}, {Out_temp}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("BatchMatMul", {X, Y}, {Out_temp}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + template class MatMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. 
But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Dot(ctx, stream, *X, *Y, Out, alpha); + return; + } - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, transpose_x, + transpose_y, alpha); } }; @@ -62,109 +273,200 @@ template class MatMulGradNPUKernel : 
public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + float alpha = static_cast(ctx.Attr("alpha")); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); + + if (dX) { + Mul(ctx, stream, dout_temp, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, stream, dout_temp, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } - runner_dy.Run(stream); + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - 
runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, transpose_x, + alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, + alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = transpose_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } - runner_dx.Run(stream); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, + !transpose_y, alpha); + } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, transpose_y, + true, alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !transpose_y, alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() 
== 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, + alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !transpose_x, + false, alpha); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + transpose_x, alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, + !transpose_x, false, alpha); } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc index 9605fa092f069..f22e2e178ef85 100644 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ b/paddle/fluid/operators/meshgrid_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/meshgrid_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 90f0de60b592d..f567f4660534c 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -104,8 +104,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { scale.push_back(scale[0]); } else { // v2 std::vector scale_attr = ctx.Attr>("scale"); - scale.resize(3, scale_attr[0]); - std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + if (scale_attr.size() > 0) { + scale.resize(3, scale_attr[0]); + std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + } } } if (scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 3a1fba9455003..483c895e0e65a 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. 
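When an operand was broadcast in the forward pass (Case 4), its gradient comes out with the broadcast shape and must be summed back down to the operand's original shape; that is the job of the ReduceDims calls above. A NumPy sketch of that reduction (the helper name is hypothetical):

import numpy as np

def reduce_dims_reference(grad, src_dims):
    # Sum out the leading axes that broadcasting prepended...
    while grad.ndim > len(src_dims):
        grad = grad.sum(axis=0)
    # ...then sum over axes that were size 1 in the source and were expanded
    for i, d in enumerate(src_dims):
        if d == 1 and grad.shape[i] != 1:
            grad = grad.sum(axis=i, keepdims=True)
    return grad

# e.g. X of shape [1, 3, 4] broadcast to [5, 3, 4] in the forward pass
g = np.ones((5, 3, 4))
assert reduce_dims_reference(g, [1, 3, 4]).shape == (1, 3, 4)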
*/ +limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index b343fc88d7b8d..5efc7e9b869b7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc index 834b63f199e37..b5f571c7fea2c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index c1ba046ca6af1..c26db2500fd66 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -90,6 +90,94 @@ class ROIAlignNPUKernel : public framework::OpKernel { } }; +template +class ROIAlignNPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sample_num = ctx.Attr("sampling_ratio"); + auto in_dims = in->dims(); + auto aligned = ctx.Attr("aligned"); + + int rois_num = rois->dims()[0]; + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + if (!in_grad) { + return; + } + in_grad->mutable_data(place); + + PADDLE_ENFORCE_EQ( + aligned, false, + platform::errors::InvalidArgument( + "ROIAlignGradNPU only support Aligned attribute equaled to False")); + PADDLE_ENFORCE_EQ( + ctx.HasInput("RoisNum"), true, + platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp " + "is not found while using NPU.")); + PADDLE_ENFORCE_EQ( + rois->type(), framework::proto::VarType::FP32, + platform::errors::InvalidArgument( + "ROIAlignGradNPU only support ROIs type equaled to FP32.")); + + // Cast RoisNum to fp32 tensor + auto* RoisNum = ctx.Input("RoisNum"); + Tensor ROIs_N5; + ROIs_N5.mutable_data({rois_num, 5}, place); + Tensor ROIsNum_fp; + ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] + int nputype_fp32 = + 
static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); + const auto& runner_cast = NpuOpRunner("Cast", {*RoisNum}, {ROIsNum_fp}, + {{"dst_type", nputype_fp32}}); + runner_cast.Run(stream); + ROIsNum_fp.Resize({rois_num, 1}); + + // Combine *ROIsNum with ROIs to get new ROIs + std::vector x_list; + x_list.push_back(ROIsNum_fp); + x_list.push_back(*rois); + const auto& runner_concat = NpuOpRunner("ConcatD", {x_list}, {ROIs_N5}, + {{"N", 2}, {"concat_dim", 1}}); + runner_concat.Run(stream); + + // By analysis, in order to match the CPU grad version, + // 1 must be subtracted from rois[:,3:5] before calling the Ascend grad function + std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; + Tensor tsr_dlt; + tsr_dlt.mutable_data({5}, place); + framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); + ctx.template device_context().Wait(); + const auto& runner_add = + NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {}); + runner_add.Run(stream); + + // Call the Ascend ROIAlignGrad function + int roi_end_mode = 0; + const auto& runner_roi_align_grad = + NpuOpRunner("ROIAlignGrad", {*out_grad, ROIs_N5}, {*in_grad}, + {{"xdiff_shape", framework::vectorize(in_dims)}, + {"pooled_width", pooled_width}, + {"pooled_height", pooled_height}, + {"spatial_scale", spatial_scale}, + {"sample_num", sample_num}, + {"roi_end_mode", roi_end_mode}}); + runner_roi_align_grad.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -99,3 +187,7 @@ REGISTER_OP_NPU_KERNEL( ops::ROIAlignNPUKernel, ops::ROIAlignNPUKernel, ops::ROIAlignNPUKernel); + +REGISTER_OP_NPU_KERNEL(roi_align_grad, ops::ROIAlignNPUGradKernel, + ops::ROIAlignNPUGradKernel, + ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index b6a8111592fb7..b74dfc984affb 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -40,21 +40,23 @@ class RollOp : public framework::OperatorWithKernel { auto dims = ctx->Attrs().Get>("axis"); auto shifts = ctx->Attrs().Get>("shifts"); - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); + if (!ctx->HasInput("ShiftsTensor")) { + if (dims.size() != 0) { + PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), + platform::errors::InvalidArgument( + "When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). But received " + "dims.size() = %d, shifts.size() = %d", + dims.size(), shifts.size())); + } else { + PADDLE_ENFORCE_EQ(shifts.size(), 1, + platform::errors::InvalidArgument( + "When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts.size())); + } } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); @@ -105,6 +107,10 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker { "The number of places by which the elements " "of the tensor are shifted.") .SetDefault({}); + AddInput("ShiftsTensor", + "The number of places by which the elements of the tensor " + "are shifted.") + .AsDispensable(); AddAttr>( "axis", "Axis along which to roll. 
It must have the same size " @@ -129,6 +135,9 @@ class RollGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("roll_grad"); op->SetInput("X", this->Input("X")); + if (this->HasInput("ShiftsTensor")) { + op->SetInput("ShiftsTensor", this->Input("ShiftsTensor")); + } op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index a170ce2fb111d..d70bd58887f84 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -59,6 +59,16 @@ class RollKernel auto* in = context.Input("X"); auto* out = context.Output("Out"); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); auto* in_data = in->data(); @@ -134,6 +144,16 @@ class RollGradKernel auto* in = context.Input(framework::GradVarName("Out")); auto* out = context.Output(framework::GradVarName("X")); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); auto* in_data = in->data(); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h index e58ff521d8df7..affb5f226ed55 100644 --- a/paddle/fluid/operators/roll_op.h +++ b/paddle/fluid/operators/roll_op.h @@ -16,6 +16,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -85,6 +87,16 @@ class RollKernel : public framework::OpKernel { auto& input = input_var->Get(); auto* output = output_var->GetMutable(); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); std::vector out_vec; @@ -123,6 +135,11 @@ class RollGradKernel : public framework::OpKernel { auto& input = input_var->Get(); auto* output = output_var->GetMutable(); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); std::vector out_vec; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 6f3b40dbbf394..400a09330a348 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ 
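With ShiftsTensor declared dispensable in the op maker and read back in the CPU/CUDA kernels and the grad maker above, the shift amount can now be decided at runtime. A hedged usage sketch (assumes a Paddle build containing this patch, with the Python-side roll API forwarding a Tensor through to ShiftsTensor):

import paddle

x = paddle.arange(6, dtype="float32").reshape([2, 3])
shifts = paddle.to_tensor([1], dtype="int64")  # rank-1, as the kernels enforce
out = paddle.roll(x, shifts=shifts, axis=1)    # rolls each row right by one
print(out.numpy())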
b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 52351a98bce37..a9092d7e2abbc 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 9929df6e309d9..01ec4a2b16b4a 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -66,5 +66,7 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL(stack, + ops::StackXPUKernel, + ops::StackXPUKernel, ops::StackXPUKernel); #endif diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 49457af8f00c8..42047021b408a 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -100,9 +100,14 @@ void BasicTokenizer::Tokenize(const string& text, vector* res) const { // String is converted into wstring failedly. 
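The hunk continuing below rewrites BasicTokenizer::Tokenize to accumulate ordinary characters in cache_text and flush the cache as one token whenever a Chinese character, punctuation mark, or whitespace is hit, instead of building a space-joined string and boost::split-ing it. A minimal Python sketch of the flushing logic (the predicate is a stand-in for the IsChineseChar/IsPunctuation helpers):

def basic_tokenize(text, is_special=lambda ch: not ch.isalnum() and not ch.isspace()):
    tokens, cache = [], ""

    def flush():  # plays the role of PushCacheText()
        nonlocal cache
        if cache:
            tokens.append(cache)
            cache = ""

    for ch in text:
        if is_special(ch):     # IsChineseChar(ch) || IsPunctuation(ch)
            flush()
            tokens.append(ch)  # the special char becomes its own token
        elif ch.isspace():     # IsWhiteSpace(ch)
            flush()
        else:
            cache += ch
    flush()  # final PushCacheText() after the loop
    return tokens

print(basic_tokenize("hello, world"))  # ['hello', ',', 'world']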
return; } - - std::wstring dest_text; - for (auto ch : unicode_text) { + std::wstring cache_text = L""; + auto PushCacheText = [&]() { + if (cache_text != L"") { + res->emplace_back(cache_text); + cache_text = L""; + } + }; + for (auto& ch : unicode_text) { if (ch == 0 || ch == 0xfffd || IsControl(ch)) { continue; } @@ -110,25 +115,24 @@ void BasicTokenizer::Tokenize(const string& text, vector* res) const { ch = do_lower_case(ch); } if (IsChineseChar(ch) || IsPunctuation(ch)) { - dest_text += ' '; - dest_text += ch; - dest_text += ' '; + PushCacheText(); + res->emplace_back(std::wstring{ch}); } else if (IsWhiteSpace(ch)) { - dest_text += ' '; + PushCacheText(); } else { - dest_text += ch; + cache_text += ch; } } - boost::split(*res, dest_text, boost::is_any_of(kStripChars)); + PushCacheText(); } WordPieceTokenizer::WordPieceTokenizer( - framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, const size_t max_input_chars_per_word /* = 100 */) : vocab_(vocab), unk_token_(unk_token), max_input_chars_per_word_(max_input_chars_per_word) { - unk_token_id_ = (*vocab_)[unk_token_]; + unk_token_id_ = vocab_->at(unk_token_); } void WordPieceTokenizer::Tokenize(const wstring& text, @@ -178,7 +182,7 @@ void WordPieceTokenizer::Tokenize(const wstring& text, } } -BertTokenizer::BertTokenizer(framework::Vocab* vocab, +BertTokenizer::BertTokenizer(const framework::Vocab* vocab, bool do_lower_case /* = false */, const wstring& unk_token /* = L"[UNK]" */, const wstring& pad_token /* = L"[PAD]" */, @@ -196,11 +200,11 @@ BertTokenizer::BertTokenizer(framework::Vocab* vocab, vocab_(vocab), basic_tokenizer_(do_lower_case_), word_piece_tokenizer_(vocab_, unk_token) { - unk_token_id_ = (*vocab_)[unk_token_]; - pad_token_id_ = (*vocab_)[pad_token_]; - cls_token_id_ = (*vocab_)[cls_token_]; - mask_token_id_ = (*vocab_)[mask_token_]; - sep_token_id_ = (*vocab_)[sep_token_]; + unk_token_id_ = vocab_->at(unk_token_); + pad_token_id_ = vocab_->at(pad_token_); + cls_token_id_ = vocab_->at(cls_token_); + mask_token_id_ = vocab_->at(mask_token_); + sep_token_id_ = vocab_->at(sep_token_); all_special_tokens_ = vector( {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h old mode 100755 new mode 100644 index d9b7fa26a6704..5218b7c2eaa51 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -56,13 +56,13 @@ class BasicTokenizer { class WordPieceTokenizer { public: - explicit WordPieceTokenizer(framework::Vocab* vocab, + explicit WordPieceTokenizer(const framework::Vocab* vocab, const wstring& unk_token = L"[UNK]", const size_t max_input_chars_per_word = 100); void Tokenize(const wstring& text, vector* output) const; private: - framework::Vocab* vocab_; + const framework::Vocab* vocab_; wstring unk_token_{L"[UNK]"}; int64_t unk_token_id_; size_t max_input_chars_per_word_; @@ -70,7 +70,8 @@ class WordPieceTokenizer { class BertTokenizer { public: - explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false, + explicit BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case = false, const wstring& unk_token = L"[UNK]", const wstring& pad_token = L"[PAD]", const wstring& cls_token = L"[CLS]", @@ -106,7 +107,7 @@ class BertTokenizer { bool do_lower_case_; wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; 
string padding_site_; - framework::Vocab* vocab_; + const framework::Vocab* vocab_; BasicTokenizer basic_tokenizer_; WordPieceTokenizer word_piece_tokenizer_; int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, @@ -140,21 +141,20 @@ class FasterTokenizerKernel : public framework::OpKernel { return; } - BertTokenizer* tokenizer_ptr = - new BertTokenizer(const_cast(vocab), do_lower_case); + BertTokenizer tokenizer(vocab, do_lower_case); size_t batch_max_seq_len = 0; size_t batch_size = text->size(); vector>> batch_encode_inputs( batch_size); if (text_pair) { - tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, *text_pair, - is_split_into_words, max_seq_len, - pad_to_max_seq_len); + tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); } else { - tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, vector(), - is_split_into_words, max_seq_len, - pad_to_max_seq_len); + tokenizer.BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); } for (size_t i = 0; i < batch_size; ++i) { @@ -173,7 +173,7 @@ class FasterTokenizerKernel : public framework::OpKernel { static_cast(batch_max_seq_len)})); auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); - auto pad_token_id = tokenizer_ptr->GetPadTokenID(); + auto pad_token_id = tokenizer.GetPadTokenID(); for (size_t i = 0; i < batch_size; i++) { auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; @@ -188,7 +188,6 @@ class FasterTokenizerKernel : public framework::OpKernel { std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); } - delete tokenizer_ptr; } }; diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index cdabc28255b51..6e7e03911370f 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index dd65d743fad31..ef908be8462ed 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -705,8 +705,10 @@ PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false, * Value Range: bool, default=false * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN */ +#ifdef PADDLE_WITH_CINN PADDLE_DEFINE_EXPORTED_bool( use_cinn, false, "It controls whether to run PaddlePaddle using CINN"); +#endif DEFINE_int32(record_pool_max_size, 2000000, "SlotRecordDataset slot record pool max size"); diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 0a9a9453b53e3..121d26e39dd8b 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -119,6 +119,35 @@ XPUOpMap& get_kl2_ops() { {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"greater_than", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"greater_equal", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"fill_any_like", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 08ab1d7d34466..54ea0f2aee17f 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -40,6 +40,7 @@ // need to manually specify them in this map. 
std::map> op_ins_map = { {"layer_norm", {"X", "Scale", "Bias"}}, + {"bincount", {"X", "Weights"}}, {"fused_attention", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, @@ -71,6 +72,9 @@ std::map> op_ins_map = { {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, + {"fused_feedforward", + {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", + "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, {"matrix_rank", {"X", "TolTensor"}}, {"adam", diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc index 141ac2ba47c5b..db9ee7592fc84 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -24,26 +24,6 @@ namespace paddle { namespace string { -inline size_t count_spaces(const char* s) { - size_t count = 0; - - while (*s != 0 && isspace(*s++)) { - count++; - } - - return count; -} - -inline size_t count_nonspaces(const char* s) { - size_t count = 0; - - while (*s != 0 && !isspace(*s++)) { - count++; - } - - return count; -} - // remove leading and tailing spaces std::string trim_spaces(const std::string& str) { const char* p = str.c_str(); @@ -74,20 +54,6 @@ std::string erase_spaces(const std::string& str) { return result; } -inline int str_to_float(const char* str, float* v) { - const char* head = str; - char* cursor = NULL; - int index = 0; - while (*(head += count_spaces(head)) != 0) { - v[index++] = std::strtof(head, &cursor); - if (head == cursor) { - break; - } - head = cursor; - } - return index; -} - bool ends_with(std::string const& input, std::string const& test) { if (test.size() > input.size()) return false; return std::equal(test.rbegin(), test.rend(), input.rbegin()); diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index 1ab7690f8b517..4f1aee7c7ed17 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -26,9 +26,25 @@ namespace paddle { namespace string { -inline size_t count_spaces(const char* s); +inline size_t count_spaces(const char* s) { + size_t count = 0; -inline size_t count_nonspaces(const char* s); + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} template void format_string_append(std::string& str, const char* fmt, // NOLINT @@ -67,7 +83,19 @@ std::string trim_spaces(const std::string& str); // erase all spaces in str std::string erase_spaces(const std::string& str); -int str_to_float(const char* str, float* v); +inline int str_to_float(const char* str, float* v) { + const char* head = str; + char* cursor = NULL; + int index = 0; + while (*(head += count_spaces(head)) != 0) { + v[index++] = std::strtof(head, &cursor); + if (head == cursor) { + break; + } + head = cursor; + } + return index; +} // checks whether the test string is a suffix of the input string. 
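str_to_float, now defined inline in the header above, walks the string parsing whitespace-separated floats into v and stops at the first token strtof cannot consume. A rough Python equivalent of its contract:

def str_to_float(s):
    # Parse leading whitespace-separated floats; stop at the first bad token,
    # mirroring the head/cursor strtof loop in the C++ helper.
    vals = []
    for tok in s.split():
        try:
            vals.append(float(tok))
        except ValueError:
            break
    return vals

print(str_to_float("1.5 2 abc 3"))  # [1.5, 2.0] -- parsing stops at 'abc'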
bool ends_with(std::string const& input, std::string const& test); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2cc4bd8d05fb8..9bdd9e14d58dc 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2388,6 +2388,21 @@ function find_temporary_files() { fi } +function build_pr_and_develop() { + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl + rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` + if [ ${cmake_change} ];then + rm -rf ${PADDLE_ROOT}/build/third_party + fi + git checkout . + git checkout -b develop_base_pr upstream/$BRANCH + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl +} + function main() { local CMD=$1 @@ -2397,6 +2412,9 @@ function main() { build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} ;; + build_pr_dev) + build_pr_and_develop + ;; build_and_check) set +e check_style_info=$(check_style) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 2051a4f6fcd50..29548a64f3dad 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -98,6 +98,7 @@ from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 +from .tensor.linalg import bincount # noqa: F401 from .tensor.linalg import mv # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 @@ -295,6 +296,7 @@ from . import hub # noqa: F401 from . import linalg # noqa: F401 from . import fft # noqa: F401 +from . import signal # noqa: F401 import paddle.text # noqa: F401 import paddle.vision # noqa: F401 @@ -398,6 +400,7 @@ 'bitwise_not', 'mm', 'flip', + 'bincount', 'histogram', 'multiplex', 'CUDAPlace', diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 571199b99b0d9..aea7ad0710222 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1586,16 +1586,16 @@ def unscale_method(self, optimizer): _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, param_grads_fp32, temp_found_inf_fp32) + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") # TODO(shenliang03) Since dp allreduce in the optimizer is # after the gradscaler, check_finite needs to synchronize global # information. In the future, we should use check_group to speed. 
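The reworked unscale_method (completed by the all_reduce call that follows) stores the local overflow flag in one int32 tensor, reduces it with MAX across the data-parallel ranks, and writes the agreed value back to self._found_inf, so every rank skips the optimizer step together. A sketch of the pattern (assumes an initialized paddle.distributed process group):

import paddle
import paddle.distributed as dist

def sync_found_inf(found_inf_local):
    # found_inf_local: bool produced by this rank's check_finite_and_unscale
    flag = paddle.to_tensor([1 if found_inf_local else 0], dtype="int32")
    dist.all_reduce(flag, op=dist.ReduceOp.MAX)  # 1 if any rank overflowed
    return bool(flag.numpy()[0])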
paddle.distributed.all_reduce( - paddle.to_tensor( - [self._found_inf], dtype="int32"), - op=paddle.distributed.ReduceOp.MAX, - group=None) + is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) + self._found_inf = is_found_inf.numpy()[0] # Only tensor_parallel and pipeline_parallel need to modify scaler if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL, diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 0f5c24f022e3a..75aa9766e7b28 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -47,6 +47,7 @@ def _apply_collective_grads(parameters, comm_group): nranks = paddle.distributed.get_world_size( ) if comm_group is None else comm_group.nranks div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) + paddle.distributed.all_reduce(coalesced_grad, group=comm_group) paddle.fluid.framework._dygraph_tracer().trace_op( type="elementwise_div", inputs={'X': coalesced_grad, @@ -54,8 +55,6 @@ def _apply_collective_grads(parameters, comm_group): outputs={'Out': coalesced_grad}, attrs={'axis': -1}) - paddle.distributed.all_reduce(coalesced_grad, group=comm_group) - _split_tensors(coalesced_grads_and_vars) diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 3ac02c9c8dc18..de15eba0feffa 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -12,50 +12,1613 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .tensor.fft import fft # noqa: F401 -from .tensor.fft import fft2 # noqa: F401 -from .tensor.fft import fftn # noqa: F401 -from .tensor.fft import ifft # noqa: F401 -from .tensor.fft import ifft2 # noqa: F401 -from .tensor.fft import ifftn # noqa: F401 -from .tensor.fft import rfft # noqa: F401 -from .tensor.fft import rfft2 # noqa: F401 -from .tensor.fft import rfftn # noqa: F401 -from .tensor.fft import irfft # noqa: F401 -from .tensor.fft import irfft2 # noqa: F401 -from .tensor.fft import irfftn # noqa: F401 -from .tensor.fft import hfft # noqa: F401 -from .tensor.fft import hfft2 # noqa: F401 -from .tensor.fft import hfftn # noqa: F401 -from .tensor.fft import ihfft # noqa: F401 -from .tensor.fft import ihfft2 # noqa: F401 -from .tensor.fft import ihfftn # noqa: F401 -from .tensor.fft import fftfreq # noqa: F401 -from .tensor.fft import rfftfreq # noqa: F401 -from .tensor.fft import fftshift # noqa: F401 -from .tensor.fft import ifftshift # noqa: F401 - -__all__ = [ # noqa +from typing import Sequence +import numpy as np +import paddle +from .tensor.attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype +from .fluid.framework import in_dygraph_mode +from . import _C_ops +from .fluid.data_feeder import check_variable_and_dtype +from .fluid.layer_helper import LayerHelper + +__all__ = [ 'fft', - 'fft2', - 'fftn', 'ifft', - 'ifft2', - 'ifftn', 'rfft', - 'rfft2', - 'rfftn', 'irfft', - 'irfft2', - 'irfftn', 'hfft', - 'hfft2', - 'hfftn', 'ihfft', + 'fft2', + 'ifft2', + 'rfft2', + 'irfft2', + 'hfft2', 'ihfft2', + 'fftn', + 'ifftn', + 'rfftn', + 'irfftn', + 'hfftn', 'ihfftn', 'fftfreq', 'rfftfreq', 'fftshift', - 'ifftshift' + 'ifftshift', ] + + +def _check_normalization(norm): + if norm not in ['forward', 'backward', 'ortho']: + raise ValueError( + "Unexpected norm: {}. Norm should be forward, backward or ortho". 
+ format(norm)) + + +def _check_fft_n(n): + if not isinstance(n, int): + raise ValueError( + "Invalid FFT argument n({}), it should be an integer.".format(n)) + if n <= 0: + raise ValueError( + "Invalid FFT argument n({}), it should be positive.".format(n)) + + +def _check_fft_shape(x, s): + ndim = x.ndim + if not isinstance(s, Sequence): + raise ValueError( + "Invalid FFT argument s({}), it should be a sequence of integers.") + + if len(s) > ndim: + raise ValueError( + "Length of FFT argument s should not be larger than the rank of input. " + "Received s: {}, rank of x: {}".format(s, ndim)) + for size in s: + if not isinstance(size, int) or size <= 0: + raise ValueError("FFT sizes {} contains an invalid value ({})".format( + s, size)) + + +def _check_fft_axis(x, axis): + ndim = x.ndim + if not isinstance(axis, int): + raise ValueError( + "Invalid FFT axis ({}), it should be an integer.".format(axis)) + if axis < -ndim or axis >= ndim: + raise ValueError( + "Invalid FFT axis ({}), it should be in range [-{}, {})".format( + axis, ndim, ndim)) + + +def _check_fft_axes(x, axes): + ndim = x.ndim + if not isinstance(axes, Sequence): + raise ValueError( + "Invalid FFT axes ({}), it should be a sequence of integers.". + format(axes)) + if len(axes) > ndim: + raise ValueError( + "Length of fft axes should not be larger than the rank of input. " + "Received len of axes: {}, rank of x: {}".format(len(axes), ndim)) + for axis in axes: + if not isinstance(axis, int) or axis < -ndim or axis >= ndim: + raise ValueError( + "FFT axes {} contains an invalid value ({}), it should be in range [-{}, {})". + format(axes, axis, ndim, ndim)) + + +def _resize_fft_input(x, s, axes): + if len(s) != len(axes): + raise ValueError("length of `s` should equal the length of `axes`.") + shape = x.shape + ndim = x.ndim + + axes_to_pad = [] + paddings = [] + axes_to_slice = [] + slices = [] + for i, axis in enumerate(axes): + if shape[axis] < s[i]: + axes_to_pad.append(axis) + paddings.append(s[i] - shape[axis]) + elif shape[axis] > s[i]: + axes_to_slice.append(axis) + slices.append((0, s[i])) + + if axes_to_slice: + x = paddle.slice( + x, + axes_to_slice, + starts=[item[0] for item in slices], + ends=[item[1] for item in slices]) + if axes_to_pad: + padding_widths = [0] * (2 * ndim) + for axis, pad in zip(axes_to_pad, paddings): + padding_widths[2 * axis + 1] = pad + x = paddle.nn.functional.pad(x, padding_widths) + return x + + +def _normalize_axes(x, axes): + ndim = x.ndim + return [item if item >= 0 else (item + ndim) for item in axes] + + +def _check_at_least_ndim(x, rank): + if x.ndim < rank: + raise ValueError("The rank of the input ({}) should >= {}".format( + x.ndim, rank)) + + +# public APIs 1d +def fft(x, n=None, axis=-1, norm="backward", name=None): + """ + Calculate one-dimensional discrete Fourier transform. + + This function uses the efficient fast Fourier transform (FFT) algorithm [1] to + calculate the 1-D *n*-point discrete Fourier transform (DFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex tensor. + n (int, optional): The length of the output transform axis. If `n` is less than + the length of the input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use.
The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + fft_xp = paddle.fft.fft(xp).numpy() + print(fft_xp) + # [1.+1.25396034e+00j 1.+4.38128627e+00j 1.-4.38128627e+00j + # 1.-1.25396034e+00j 1.-4.81574619e-01j 1.+8.88178420e-16j + # 1.+4.81574619e-01j] + + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=True, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=True, name=name) + + +def ifft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the 1-D inverse discrete Fourier Transform. + + This function computes the inverse of the 1-D *n*-point discrete Fourier transform + computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. + + The input should be ordered in the same way as is returned by `fft`, + i.e., + + * ``x[0]`` should contain the zero frequency term, + * ``x[1:n//2]`` should contain the positive-frequency terms, + * ``x[n//2 + 1:]`` should contain the negative-frequency terms, in + increasing order starting from the most negative frequency. + + For an even number of input points, ``x[n//2]`` represents the sum of + the values at the positive and negative Nyquist frequencies, as the two + are aliased together. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + ifft_xp = paddle.fft.ifft(xp).numpy() + print(ifft_xp) + # [0.14285714+1.79137191e-01j 0.14285714+6.87963741e-02j + # 0.14285714+1.26882631e-16j 0.14285714-6.87963741e-02j + # 0.14285714-1.79137191e-01j 0.14285714-6.25898038e-01j + # 0.14285714+6.25898038e-01j] + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=False, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=False, name=name) + + +def rfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The one dimensional FFT for real input. + + This function computes the one dimensional *n*-point discrete Fourier + Transform (DFT) of a real-valued tensor by means of an efficient algorithm + called the Fast Fourier Transform (FFT). + + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. + + Args: + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional): Axis over which to compute the FFT. Default value + is last axis. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor + + Raises: + + + Examples: + .. code-block:: python + import paddle + + x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) + print(paddle.fft.rfft(x)) + # Tensor(shape=[3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [ (1+0j), -1j , (-1+0j)]) + """ + return fft_r2c(x, n, axis, norm, forward=True, onesided=True, name=name) + + +def irfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Computes the inverse of `rfft`. + + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, + ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. + + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding + positive frequency term. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. 
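The ``2 * (k-1)`` default described above means an odd output length cannot be inferred from the one-sided input, so `n` must be passed explicitly in that case; NumPy follows the same convention:

import numpy as np

r = np.fft.rfft(np.ones(5))      # k = 3 one-sided coefficients from length 5
print(np.fft.irfft(r).shape)     # (4,) -- default n = 2*(k-1) assumes even length
print(np.fft.irfft(r, 5).shape)  # (5,) -- pass n to recover the odd length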
+ axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` + in some cases. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + irfft_xp = paddle.fft.irfft(xp).numpy() + print(irfft_xp) + # [0. 1. 0. 0.] + + """ + return fft_c2r(x, n, axis, norm, forward=False, name=name) + + +def hfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the FFT of a signal that has Hermitian symmetry, a real + spectrum. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + some cases. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + hfft_xp = paddle.fft.hfft(xp).numpy() + print(hfft_xp) + # [0. 0. 0. 4.] + """ + + return fft_c2r(x, n, axis, norm, forward=True, name=name) + + +def ihfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The inverse FFT of a signal that has Hermitian symmetry. + + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called + the Fast Fourier Transform (FFT). + + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. 
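The ``n//2 + 1`` output length quoted above follows from the Hermitian symmetry of a real signal's spectrum: the negative-frequency half is the conjugate of the positive half, so only one side needs to be stored. A NumPy verification:

import numpy as np

x = np.random.rand(6)
full = np.fft.fft(x)   # length 6, Hermitian-symmetric
half = np.fft.rfft(x)  # length 6//2 + 1 == 4, one-sided
assert half.shape == (4,)
assert np.allclose(full[:4], half)             # stored half matches
assert np.allclose(full[5], np.conj(full[1]))  # F[n-k] == conj(F[k])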
+ + Args: + x(Tensor): Input tensor. + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional) : Axis over which to compute the inverse FFT. If not + given, the last axis is used. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor. + + Examples: + .. code-block:: python + import paddle + + spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) + print(paddle.fft.ifft(spectrum)) + # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) + print(paddle.fft.ihfft(spectrum)) + # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) + + """ + return fft_r2c(x, n, axis, norm, forward=False, onesided=True, name=name) + + +# public APIs nd +def fftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D discrete Fourier Transform. + + This function calculates the n-D discrete Fourier transform on any number of axes + in the M-D array by fast Fourier transform (FFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:4, :4, :4][1] + xp = paddle.to_tensor(x) + fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() + print(fftn_xp) + # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] + """ + if is_interger(x) or is_floating_point(x): + return fftn_r2c( + x, s, axes, norm, forward=True, onesided=False, name=name) + else: + return fftn_c2c(x, s, axes, norm, forward=True, name=name) + + +def ifftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D inverse discrete Fourier Transform. + + This function computes the inverse of the N-D discrete + Fourier Transform over any number of axes in an M-D array by + means of the Fast Fourier Transform (FFT). In other words, + ``ifftn(fftn(x)) == x`` to within numerical accuracy. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fftn`, i.e., it should have the term for zero frequency + in all axes in the low-order corner, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
+def ifftn(x, s=None, axes=None, norm="backward", name=None):
+    """
+    Compute the N-D inverse discrete Fourier Transform.
+
+    This function computes the inverse of the N-D discrete
+    Fourier Transform over any number of axes in an M-D array by
+    means of the Fast Fourier Transform (FFT). In other words,
+    ``ifftn(fftn(x)) == x`` to within numerical accuracy.
+
+    The input, analogously to `ifft`, should be ordered in the same way as is
+    returned by `fftn`, i.e., it should have the term for zero frequency
+    in all axes in the low-order corner, the positive frequency terms in the
+    first half of all axes, the term for the Nyquist frequency in the middle
+    of all axes and the negative frequency terms in the second half of all
+    axes, in order of decreasingly negative frequency.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type. It can be a complex or real tensor.
+        s (sequence of ints, optional): Shape (length of each transformed axis) of the output
+            (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.).
+            This corresponds to ``n`` for ``fft(x, n)``.
+            Along any axis, if the given shape is smaller than that of the input,
+            the input is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified
+            by `axes` is used.
+        axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)``
+            axes are used, or all axes if `s` is also not specified.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward", meaning no normalization on
+            the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies
+            the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are
+            scaled by ``1/sqrt(n)``.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        complex tensor. The truncated or zero-padded input, transformed along the axes indicated by
+        `axes`, or by a combination of `s` and `x`, as explained in the parameters section above.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.eye(3)
+            xp = paddle.to_tensor(x)
+            ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy()
+            print(ifftn_xp)
+            # [[ 0.33333333+0.j          0.33333333+0.j          0.33333333-0.j        ]
+            #  [ 0.33333333+0.j         -0.16666667+0.28867513j -0.16666667-0.28867513j]
+            #  [ 0.33333333+0.j         -0.16666667-0.28867513j -0.16666667+0.28867513j]]
+
+    """
+    if is_interger(x) or is_floating_point(x):
+        return fftn_r2c(
+            x, s, axes, norm, forward=False, onesided=False, name=name)
+    else:
+        return fftn_c2c(x, s, axes, norm, forward=False, name=name)
+
+
+def rfftn(x, s=None, axes=None, norm="backward", name=None):
+    """
+    The N-dimensional FFT for real input.
+
+    This function computes the N-dimensional discrete Fourier Transform over
+    any number of axes in an M-dimensional real array by means of the Fast
+    Fourier Transform (FFT). By default, all axes are transformed, with the
+    real transform performed over the last axis, while the remaining
+    transforms are complex.
+
+    The transform for real input is performed over the last transformation
+    axis, as by `rfft`, then the transform over the remaining axes is
+    performed as by `fftn`. The order of the output is as for `rfft` for the
+    final transformation axis, and as for `fftn` for the remaining
+    transformation axes.
+
+    Args:
+        x(Tensor): Input tensor, taken to be real.
+        s(Sequence[int]): Shape (length of each transformed axis) to use from the
+            input. The final element of `s` corresponds to `n` for ``rfft(x, n)``,
+            while for the remaining axes, it corresponds to `n` for ``fft(x, n)``.
+            Along any axis, if the given shape is smaller than that of the input,
+            the input is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified
+            by `axes` is used.
+        axes(Sequence[int]): Axes over which to compute the FFT. If not given,
+            the last ``len(s)`` axes are used, or all axes if `s` is also not
+            specified.
+        norm(str, optional): Normalization mode, indicates which direction of
+            the forward/backward pair of transforms is scaled and with what
+            normalization factor. It must be one of "backward", "ortho" or
+            "forward". Default value is "backward".
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name` .
+
+    Returns:
+        out(Tensor): complex tensor.
+
+    Raises:
+        ValueError: If `s` and `axes` have different lengths.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            # default: all axes are used to compute the FFT
+            x = paddle.ones((2, 3, 4))
+            print(paddle.fft.rfftn(x))
+            # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
+            #        [[[(24+0j), 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ]],
+            #
+            #         [[0j     , 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ]]])
+
+            # use axes(2, 0)
+            print(paddle.fft.rfftn(x, axes=(2, 0)))
+            # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
+            #        [[[(8+0j), 0j     , 0j     ],
+            #          [(8+0j), 0j     , 0j     ],
+            #          [(8+0j), 0j     , 0j     ]],
+            #
+            #         [[0j     , 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ],
+            #          [0j     , 0j     , 0j     ]]])
+
+    """
+    return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name)
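Because `rfftn` stores only half of the last transformed axis, odd lengths cannot be inferred on the way back; a small hedged sketch of the round trip through `irfftn` (defined next), illustrative only:

    import paddle

    x = paddle.rand([3, 5])                  # odd final axis
    y = paddle.fft.rfftn(x)                  # shape [3, 3]: last axis -> 5//2 + 1
    # Without `s`, irfftn would assume an even last axis of 2*(3-1) = 4,
    # so pass the original shape back explicitly to recover the odd length.
    x2 = paddle.fft.irfftn(y, s=x.shape)
    print(x2.shape, float((x - x2).abs().max()))   # [3, 5] ~0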
+def irfftn(x, s=None, axes=None, norm="backward", name=None):
+    """
+    Computes the inverse of `rfftn`.
+
+    This function computes the inverse of the N-D discrete
+    Fourier Transform for real input over any number of axes in an
+    M-D array by means of the Fast Fourier Transform (FFT). In
+    other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical
+    accuracy. (The ``x.shape`` is necessary like ``len(x)`` is for `irfft`,
+    and for the same reason.)
+
+    The input should be ordered in the same way as is returned by `rfftn`,
+    i.e., as for `irfft` for the final transformation axis, and as for `ifftn`
+    along all the other axes.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): The length of each output transform axis
+            (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the
+            number of input points used along this axis, except for the last axis,
+            where ``s[-1]//2+1`` points of the input are used. Along any axis, if
+            the shape indicated by `s` is smaller than that of the input, the input
+            is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified by
+            `axes` is used, except for the last axis, which is taken to be
+            ``2*(k-1)`` where ``k`` is the length of the input along that axis.
+        axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last
+            `len(s)` axes are used, or all axes if `s` is also not specified.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`,
+        or by a combination of `s` and `x`, as explained in the parameters section above. The length of
+        each transformed axis is as given by the corresponding element of `s`, or the length of the input
+        in every axis except for the last one if `s` is not given. In the final transformed axis the length
+        of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final
+        transformed axis of the input. To get an odd number of output points in the final axis,
+        `s` must be specified.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128)
+            xp = paddle.to_tensor(x)
+            irfftn_xp = paddle.fft.irfftn(xp).numpy()
+            print(irfftn_xp)
+            # [ 2.25 -1.25  0.25  0.75]
+
+    """
+    return fftn_c2r(x, s, axes, norm, forward=False, name=name)
+def hfftn(x, s=None, axes=None, norm="backward", name=None):
+    """
+    Compute the N-D FFT of Hermitian symmetric complex input, i.e., a
+    signal with a real spectrum.
+
+    This function calculates the N-D discrete Fourier transform of Hermitian
+    symmetric complex input over any axes in an M-D array by means of the fast
+    Fourier transform (FFT). In other words, ``ihfftn(hfftn(x, s)) == x`` to
+    within numerical accuracy. (``s`` here is ``x.shape`` with
+    ``s[-1] = x.shape[-1] * 2 - 1``. This is necessary for the same reason
+    that `irfft` requires ``x.shape``.)
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): The length of each output transform axis
+            (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the
+            number of input points used along this axis, except for the last axis,
+            where ``s[-1]//2+1`` points of the input are used. Along any axis, if
+            the shape indicated by `s` is smaller than that of the input, the input
+            is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified by
+            `axes` is used, except for the last axis, which is taken to be
+            ``2*(k-1)`` where ``k`` is the length of the input along that axis.
+        axes (sequence of ints, optional): Axes over which to compute the FFT. If not given, the last
+            `len(s)` axes are used, or all axes if `s` is also not specified.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Real tensor. The truncated or zero-padded input, transformed along the axes
+        indicated by `axes`, or by a combination of `s` and `x`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128)
+            xp = paddle.to_tensor(x)
+            hfftn_xp = paddle.fft.hfftn(xp).numpy()
+            print(hfftn_xp)
+            # [ 9.  3.  1. -5.]
+
+    """
+    return fftn_c2r(x, s, axes, norm, forward=True, name=name)
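The round-trip identity quoted in the docstring can be checked directly; note the zero-frequency term must be purely real for an exact recovery. A hedged sketch with illustrative values (not part of the patch):

    import numpy as np
    import paddle

    # x[0] is kept purely real: hfftn discards the imaginary part of the
    # zero-frequency term of a Hermitian half-spectrum.
    x = paddle.to_tensor(np.array([1.0 + 0j, 2 + 3j, 4 - 1j]))
    s = [x.shape[-1] * 2 - 1]       # odd full length: 2*3 - 1 = 5
    y = paddle.fft.hfftn(x, s=s)    # real tensor of length 5
    x2 = paddle.fft.ihfftn(y)       # complex tensor of length 5//2 + 1 = 3
    diff = x - x2
    print(float(paddle.real(diff).abs().max()), float(paddle.imag(diff).abs().max()))  # ~0 ~0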
+def ihfftn(x, s=None, axes=None, norm="backward", name=None):
+    """
+    The N-dimensional inverse FFT of a signal that has Hermitian symmetry.
+
+    This function computes the N-dimensional inverse FFT over any number of
+    axes of an M-dimensional signal that has Hermitian symmetry by means of
+    an efficient algorithm called the Fast Fourier Transform (FFT).
+
+    Args:
+        x(Tensor): Input tensor.
+        s(Sequence[int], optional): Shape (length along each transformed axis)
+            to use from the input (``s[0]`` refers to axis 0, ``s[1]`` to axis
+            1, etc.). Along any axis, if the given shape is smaller than that
+            of the input, the input is cropped. If it is larger, the input is
+            padded with zeros. If `s` is not given, the shape of the input
+            along the axes specified by `axes` is used.
+        axes(Sequence[int], optional): Axes over which to compute the inverse
+            FFT. If not given, the last ``len(s)`` axes are used, or all axes
+            if `s` is also not specified.
+        norm(str, optional): Normalization mode, indicates which direction of
+            the forward/backward pair of transforms is scaled and with what
+            normalization factor. It must be one of "backward", "ortho" or
+            "forward". Default value is "backward".
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name` .
+
+    Returns:
+        out(Tensor): complex tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0])
+            print(paddle.fft.ihfftn(spectrum))
+            # Tensor(shape=[4], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
+            #     [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)])
+
+    """
+    return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name)
+
+
+# public APIs 2d
+def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    Compute the 2-D discrete Fourier Transform.
+
+    This function computes the N-D discrete Fourier Transform
+    over any axes in an M-D array by means of the
+    Fast Fourier Transform (FFT). By default, the transform is computed over
+    the last two axes of the input array, i.e., a 2-dimensional FFT.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): Shape (length of each transformed axis) of the output.
+            It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``.
+            Along each axis, if the given shape is smaller than that of the input,
+            the input is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified
+            by `axes` is used. Default is None.
+        axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a
+            sequence of 2 integers. If not specified, the last two axes are used by default.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`,
+        or the last two axes if `axes` is not given.
+
+    Raises:
+        ValueError: If `s` is not None and is not a sequence of 2 integers.
+        ValueError: If `axes` is not None and is not a sequence of 2 integers.
+        ValueError: If the input dimension is smaller than 2.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.mgrid[:2, :2][1]
+            xp = paddle.to_tensor(x)
+            fft2_xp = paddle.fft.fft2(xp).numpy()
+            print(fft2_xp)
+            # [[ 2.+0.j -2.+0.j]
+            #  [ 0.+0.j  0.+0.j]]
+
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return fftn(x, s, axes, norm, name)
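Since `fft2` simply forwards to `fftn` after validating `s` and `axes`, the two are interchangeable on the last two axes; a small illustrative check (not part of the patch):

    import paddle

    x = paddle.rand([2, 3, 4])
    # Identical by construction: fft2 delegates to fftn.
    diff = paddle.fft.fft2(x) - paddle.fft.fftn(x, axes=(-2, -1))
    print(float(paddle.real(diff).abs().max()), float(paddle.imag(diff).abs().max()))  # 0.0 0.0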
+def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    Compute the 2-D inverse discrete Fourier Transform.
+
+    This function computes the inverse of the 2-D discrete Fourier
+    Transform over any number of axes in an M-D array by means of
+    the Fast Fourier Transform (FFT). In other words, ``ifft2(fft2(x)) == x``
+    to within numerical accuracy. By default, the inverse transform is
+    computed over the last two axes of the input array.
+
+    The input, analogously to `ifft`, should be ordered in the same way as is
+    returned by `fft2`, i.e., it should have the term for zero frequency
+    in the low-order corner of the two axes, the positive frequency terms in
+    the first half of these axes, the term for the Nyquist frequency in the
+    middle of the axes and the negative frequency terms in the second half of
+    both axes, in order of decreasingly negative frequency.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): Shape (length of each transformed axis) of the output.
+            It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``.
+            Along each axis, if the given shape is smaller than that of the input,
+            the input is cropped. If it is larger, the input is padded with zeros.
+            If `s` is not given, the shape of the input along the axes specified
+            by `axes` is used. Default is None.
+        axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a
+            sequence of 2 integers. If not specified, the last two axes are used by default.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`,
+        or the last two axes if `axes` is not given.
+
+    Raises:
+        ValueError: If `s` is not None and is not a sequence of 2 integers.
+        ValueError: If `axes` is not None and is not a sequence of 2 integers.
+        ValueError: If the input dimension is smaller than 2.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.mgrid[:2, :2][1]
+            xp = paddle.to_tensor(x)
+            ifft2_xp = paddle.fft.ifft2(xp).numpy()
+            print(ifft2_xp)
+            # [[ 0.5+0.j -0.5+0.j]
+            #  [ 0. +0.j  0. +0.j]]
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return ifftn(x, s, axes, norm, name)
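The 2-D wrappers all share the argument validation shown above; passing anything other than a length-2 sequence raises immediately, e.g. (illustrative only):

    import paddle

    x = paddle.rand([4, 4])
    try:
        paddle.fft.ifft2(x, axes=(0, 1, 2))   # three axes: rejected by the length check
    except ValueError as e:
        print(e)
    # Invalid FFT argument axes ((0, 1, 2)), it should be a sequence of 2 integers.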
+def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    The two-dimensional FFT with real tensor input.
+
+    This is really just `rfftn` with different default behavior.
+    For more details see `rfftn`.
+
+    Args:
+        x(Tensor): Input tensor, taken to be real.
+        s(Sequence[int]): Shape of the FFT.
+        axes(Sequence[int], optional): Axes over which to compute the FFT.
+        norm(str, optional): {"backward", "ortho", "forward"},
+            default is "backward". Indicates which direction of the
+            forward/backward pair of transforms is scaled and with what
+            normalization factor.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name` .
+
+    Returns:
+        out(Tensor): The result of the real 2-D FFT.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32))
+            print(paddle.fft.rfft2(x))
+            # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
+            #        [[ (50+0j)                                 , (1.1920928955078125e-07+0j)                     , 0j],
+            #         [(-12.5+17.204774856567383j)              , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j],
+            #         [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j],
+            #         [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j],
+            #         [(-12.5-17.204774856567383j)              , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j]])
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return rfftn(x, s, axes, norm, name)
+
+
+def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    Computes the inverse of `rfft2`.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None.
+        axes (sequence of ints, optional): The axes over which to compute the inverse FFT. It should
+            be a sequence of 2 integers. If not specified, the last two axes are used by default.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Real tensor. The result of the inverse real 2-D FFT.
+
+    Raises:
+        ValueError: If `s` is not None and is not a sequence of 2 integers.
+        ValueError: If `axes` is not None and is not a sequence of 2 integers.
+        ValueError: If the input dimension is smaller than 2.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128)
+            xp = paddle.to_tensor(x)
+            irfft2_xp = paddle.fft.irfft2(xp).numpy()
+            print(irfft2_xp)
+            # [[ 2.375 -1.125  0.375  0.875]
+            #  [ 0.125  0.125  0.125  0.125]]
+
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return irfftn(x, s, axes, norm, name)
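A quick hedged round-trip sketch for the 2-D real pair above (even trailing axis, so no explicit `s` is needed; illustrative only):

    import paddle

    x = paddle.rand([4, 6])          # even trailing axis
    y = paddle.fft.rfft2(x)          # shape [4, 4]: last axis -> 6//2 + 1
    x2 = paddle.fft.irfft2(y)        # last axis restored to 2*(4-1) = 6
    print(x2.shape, float((x - x2).abs().max()))   # [4, 6] ~0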
+def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    Compute the 2-D FFT of a Hermitian complex array.
+
+    Args:
+        x (Tensor): The input data. It's a Tensor type.
+        s (sequence of ints, optional): Shape of the real output. Default is None.
+        axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a
+            sequence of 2 integers. If not specified, the last two axes are used by default.
+        norm (str): Indicates which direction to scale the `forward` or `backward` transform
+            pair and what normalization factor to use. The parameter value must be one
+            of "forward", "backward" or "ortho". Default is "backward".
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Real tensor. The real result of the 2-D FFT of the Hermitian input.
+
+    Raises:
+        ValueError: If `s` is not None and is not a sequence of 2 integers.
+        ValueError: If `axes` is not None and is not a sequence of 2 integers.
+        ValueError: If the input dimension is smaller than 2.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128)
+            xp = paddle.to_tensor(x)
+            hfft2_xp = paddle.fft.hfft2(xp).numpy()
+            print(hfft2_xp)
+            # [[19.  7.  3. -9.]
+            #  [ 1.  1.  1.  1.]]
+
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return hfftn(x, s, axes, norm, name)
+
+
+def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
+    """
+    Compute the two-dimensional inverse FFT of a real spectrum.
+
+    This is really `ihfftn` with different defaults.
+    For more details see `ihfftn`.
+
+    Args:
+        x(Tensor): Input tensor.
+        s(Sequence[int], optional): Shape of the real input to the inverse FFT.
+        axes(Sequence[int], optional): The axes over which to compute the
+            inverse FFT. Default is the last two axes.
+        norm(str, optional): {"backward", "ortho", "forward"}. Default is
+            "backward".
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name` .
+
+    Returns:
+        out(Tensor): The result of the inverse Hermitian 2-D FFT.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.mgrid[:5, :5][0].astype(np.float64)
+            xp = paddle.to_tensor(x)
+            ihfft2_xp = paddle.fft.ihfft2(xp).numpy()
+            print(ihfft2_xp)
+            # [[ 2. +0.j          0. +0.j          0. +0.j        ]
+            #  [-0.5-0.68819096j  0. +0.j          0. +0.j        ]
+            #  [-0.5-0.16245985j  0. +0.j          0. +0.j        ]
+            #  [-0.5+0.16245985j  0. +0.j          0. +0.j        ]
+            #  [-0.5+0.68819096j  0. +0.j          0. +0.j        ]]
+    """
+    _check_at_least_ndim(x, 2)
+    if s is not None:
+        if not isinstance(s, Sequence) or len(s) != 2:
+            raise ValueError(
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
+                format(s))
+    if axes is not None:
+        if not isinstance(axes, Sequence) or len(axes) != 2:
+            raise ValueError(
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
+                format(axes))
+    return ihfftn(x, s, axes, norm, name)
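As the docstring says, `ihfft2` is just `ihfftn` with 2-D defaults; a one-line hedged equivalence check (illustrative only):

    import paddle

    x = paddle.rand([4, 5])
    diff = paddle.fft.ihfft2(x) - paddle.fft.ihfftn(x, axes=(-2, -1))
    print(float(paddle.real(diff).abs().max()), float(paddle.imag(diff).abs().max()))  # 0.0 0.0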
+
+
+# public APIs utilities
+def fftfreq(n, d=1.0, dtype=None, name=None):
+    """
+    Return the Discrete Fourier Transform sample frequencies.
+
+    The returned float tensor `f` contains the frequency bin centers in cycles
+    per unit of the sample spacing (with zero at the start). For instance, if
+    the sample spacing is in seconds, then the frequency unit is cycles/second.
+
+    Given input length `n` and a sample spacing `d`::
+
+        f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n)        if n is even
+        f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n)  if n is odd
+
+    Args:
+        n (int): Window length (number of samples).
+        d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1.0.
+        dtype (str, optional): The data type of the returned tensor. If not given,
+            the global default dtype is used.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor. A tensor of length `n` containing the sample frequencies.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.array([3, 1, 2, 2, 3], dtype=float)
+            scalar_temp = 0.5
+            n = x.size
+            fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp)
+            print(fftfreq_xp)
+            # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
+            #        [ 0.        ,  0.40000001,  0.80000001, -0.80000001, -0.40000001])
+
+    """
+    if dtype is None:
+        dtype = paddle.framework.get_default_dtype()
+    val = 1.0 / (n * d)
+    pos_max = (n + 1) // 2
+    neg_max = n // 2
+    indices = paddle.arange(-neg_max, pos_max, dtype=dtype, name=name)
+    indices = paddle.roll(indices, -neg_max, name=name)
+    return indices * val
+
+
+def rfftfreq(n, d=1.0, dtype=None, name=None):
+    """
+    Return the Discrete Fourier Transform sample frequencies (for use with `rfft`).
+
+    The returned float tensor `f` contains the frequency bin centers in cycles
+    per unit of the sample spacing (with zero at the start).
+
+    Given input length `n` and a sample spacing `d`::
+
+        f = [0, 1, ..., n/2-1, n/2] / (d*n)          if n is even
+        f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n)  if n is odd
+
+    The Nyquist frequency component is considered to be positive.
+
+    Args:
+        n (int): Window length (number of samples).
+        d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1.0.
+        dtype (str, optional): The data type of the returned tensor. If not given,
+            the global default dtype is used.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor. A tensor of length ``n//2 + 1`` containing the sample frequencies.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.array([3, 1, 2, 2, 3], dtype=float)
+            scalar_temp = 0.3
+            n = x.size
+            rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp)
+            print(rfftfreq_xp)
+            # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
+            #        [0.        , 0.66666669, 1.33333337])
+
+    """
+    if dtype is None:
+        dtype = paddle.framework.get_default_dtype()
+    val = 1.0 / (n * d)
+    pos_max = 1 + n // 2
+    indices = paddle.arange(0, pos_max, dtype=dtype, name=name)
+    return indices * val
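To make the odd-`n` formula above concrete, the frequencies for ``n=5, d=0.5`` can be reproduced by hand (a hedged illustrative check, not part of the patch):

    import paddle

    n, d = 5, 0.5
    # Odd n: [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n)
    expected = paddle.to_tensor([0., 1., 2., -2., -1.]) / (d * n)
    print(paddle.allclose(paddle.fft.fftfreq(n, d), expected))   # True
    # rfftfreq keeps only the non-negative half: [0, 1, 2] / (d*n)
    print(paddle.fft.rfftfreq(n, d))                             # [0., 0.4, 0.8]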
+def fftshift(x, axes=None, name=None):
+    """
+    Shift the zero-frequency component to the center of the spectrum.
+
+    This function swaps half-spaces for all the axes listed (all by default).
+    Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even.
+
+    Args:
+        x (Tensor): The input tensor.
+        axes (int|tuple, optional): The axes over which to shift. Default is None,
+            which shifts all axes.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor. The shifted tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.array([3, 1, 2, 2, 3], dtype=float)
+            n = x.size
+            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
+            res = paddle.fft.fftshift(fftfreq_xp).numpy()
+            print(res)
+            # [-1.3333334 -0.6666667  0.         0.6666667  1.3333334]
+
+    """
+    shape = paddle.shape(x)
+    if axes is None:
+        # shift all axes
+        rank = paddle.rank(x).reshape([1])
+        axes = paddle.arange(0, rank)
+        shifts = [size // 2 for size in shape]
+    elif isinstance(axes, int):
+        shifts = shape[axes] // 2
+    else:
+        shifts = [shape[ax] // 2 for ax in axes]
+    return paddle.roll(x, shifts, axes, name=name)
+
+
+def ifftshift(x, axes=None, name=None):
+    """
+    The inverse of `fftshift`. The result is identical to `fftshift` for
+    even-length `x`, but differs by one sample for odd-length `x`, as in
+    the example below.
+
+    Args:
+        x (Tensor): The input tensor.
+        axes (int|tuple, optional): The axes over which to shift. Default is None,
+            which shifts all axes.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor. The shifted tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.array([3, 1, 2, 2, 3], dtype=float)
+            n = x.size
+            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
+            res = paddle.fft.ifftshift(fftfreq_xp).numpy()
+            print(res)
+            # [ 1.3333334 -1.3333334 -0.6666667  0.         0.6666667]
+
+    """
+    shape = paddle.shape(x)
+    if axes is None:
+        # shift all axes
+        rank = paddle.rank(x).reshape([1])
+        axes = paddle.arange(0, rank)
+        shifts = [-size // 2 for size in shape]
+    elif isinstance(axes, int):
+        shifts = -shape[axes] // 2
+    else:
+        shifts = [-shape[ax] // 2 for ax in axes]
+    return paddle.roll(x, shifts, axes, name=name)
+
+
+# internal functions
+def fft_c2c(x, n, axis, norm, forward, name):
+    if is_interger(x):
+        x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype()))
+    elif is_floating_point(x):
+        x = paddle.cast(x, _real_to_complex_dtype(x.dtype))
+    _check_normalization(norm)
+
+    axis = axis if axis is not None else -1
+    _check_fft_axis(x, axis)
+    axes = [axis]
+    axes = _normalize_axes(x, axes)
+    if n is not None:
+        _check_fft_n(n)
+        s = [n]
+        x = _resize_fft_input(x, s, axes)
+    op_type = 'fft_c2c'
+
+    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type)
+    if in_dygraph_mode():
+        attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
+        out = getattr(_C_ops, op_type)(x, *attrs)
+    else:
+        inputs = {'X': [x], }
+        attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype(input_param_name='x')
+        out = helper.create_variable_for_type_inference(dtype)
+        outputs = {"Out": [out]}
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+    return out
+
+
+def fft_r2c(x, n, axis, norm, forward, onesided, name):
+    if is_interger(x):
+        x = paddle.cast(x, paddle.get_default_dtype())
+    _check_normalization(norm)
+    axis = axis if axis is not None else -1
+    _check_fft_axis(x, axis)
+    axes = [axis]
+    axes = _normalize_axes(x, axes)
+    if n is not None:
+        _check_fft_n(n)
+        s = [n]
+        x = _resize_fft_input(x, s, axes)
+    op_type = 'fft_r2c'
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type)
+
+    if in_dygraph_mode():
+        attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
+                 'onesided', onesided)
+        out = getattr(_C_ops, op_type)(x, *attrs)
+    else:
+ inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fft_c2r(x, n, axis, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + axis = axis if axis is not None else -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n // 2 + 1] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2r' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + if n is not None: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'last_dim_size', n) + else: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + if n is not None: + attrs['last_dim_size'] = n + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _complex_to_real_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_c2c(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes).tolist() + axes = [axes[i] for i in axes_argsoft] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". 
+ format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + + if s is not None: + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2c' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_r2c(x, s, axes, norm, forward, onesided, name): + if is_interger(x): + x = paddle.cast(x, paddle.get_default_dtype()) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". + format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + [s[-1]] + + if s is not None: + x = _resize_fft_input(x, s, axes) + + op_type = 'fft_r2c' + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'onesided', onesided) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + return out + + +def fftn_c2r(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". 
+                format(len(s), len(axes)))
+            s = [s[i] for i in axes_argsoft] + [s[-1]]
+
+    if s is not None:
+        fft_input_shape = list(s)
+        fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1
+        x = _resize_fft_input(x, fft_input_shape, axes)
+
+    op_type = 'fft_c2r'
+    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type)
+
+    if in_dygraph_mode():
+        if s:
+            attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
+                     'last_dim_size', s[-1])
+        else:
+            attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
+        out = getattr(_C_ops, op_type)(x, *attrs)
+    else:
+        inputs = {'X': [x], }
+        attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
+        if s:
+            attrs["last_dim_size"] = s[-1]
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype(input_param_name='x')
+        out = helper.create_variable_for_type_inference(
+            _complex_to_real_dtype(dtype))
+        outputs = {"Out": [out]}
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+    return out
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 694f9dc25e80c..e1855ee6db9af 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -1590,7 +1590,10 @@ def transform(t, device, dtype, blocking):
             return new_t
-        self._apply(transform, device, dtype, blocking)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=UserWarning)
+            self._apply(transform, device, dtype, blocking)
+
+        self._dtype = dtype
     # [aliases] Compatible with old method names
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f444b5e9c0e5f..af2316a9a443e 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2316,10 +2316,13 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
            the same shape because of dataflow model of PaddlePaddle while the
            tensors in the tuples or the lists can have different shapes.
-        2. Any tensors or operations created outside of ``true_fn`` and
-           ``false_fn`` will be executed regardless of which branch is selected at
-           runtime. This has frequently surprised users who expected a lazy
-           semantics. For example:
+        2. This API can be used under both static mode and dygraph mode. In
+           dygraph mode, the API runs only one branch based on the condition.
+
+        3. In static mode, any tensors or operations created outside or inside
+           of ``true_fn`` and ``false_fn`` will be built into the network
+           regardless of which branch is selected at runtime. This has
+           frequently surprised users, who expected lazy semantics. For example:
        .. code-block:: python
            import paddle
            a = paddle.zeros((1, 1))
            b = paddle.zeros((1, 1))
            c = a * b
-            out = paddle.nn.cond(a < b, lambda: a + c, lambda: b * b)
+            out = paddle.static.nn.cond(a < b, lambda: a + c, lambda: b * b)
-        No matter whether ``a < b`` , ``c = a * b`` will run.
+        Regardless of whether ``a < b``, ``c = a * b`` will be built into the
+        network and run. ``a + c`` and ``b * b`` will also be built into the
+        network, but only one branch will be executed at runtime.
     Args:
         pred(Tensor): A boolean tensor whose numel should be 1.
The boolean @@ -2366,24 +2371,24 @@ def cond(pred, true_fn=None, false_fn=None, name=None): # return 3, 2 # - def true_func(): - return paddle.fill_constant(shape=[1, 2], dtype='int32', - value=1), paddle.fill_constant(shape=[2, 3], - dtype='bool', - value=True) + return paddle.full(shape=[1, 2], dtype='int32', + fill_value=1), paddle.full(shape=[2, 3], + dtype='bool', + fill_value=True) def false_func(): - return paddle.fill_constant(shape=[3, 4], dtype='float32', - value=3), paddle.fill_constant(shape=[4, 5], - dtype='int64', - value=2) + return paddle.full(shape=[3, 4], dtype='float32', + fill_value=3), paddle.full(shape=[4, 5], + dtype='int64', + fill_value=2) + - x = paddle.fill_constant(shape=[1], dtype='float32', value=0.1) - y = paddle.fill_constant(shape=[1], dtype='float32', value=0.23) + x = paddle.full(shape=[1], dtype='float32', fill_value=0.1) + y = paddle.full(shape=[1], dtype='float32', fill_value=0.23) pred = paddle.less_than(x=x, y=y, name=None) - ret = paddle.nn.cond(pred, true_func, false_func) + ret = paddle.static.nn.cond(pred, true_func, false_func) # ret is a tuple containing 2 tensors # ret[0] = [[1 1]] # ret[1] = [[ True True True] diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c6d90ee404fb5..34ba1d19b809c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -98,7 +98,9 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) endforeach() if(NOT WITH_GPU) + LIST(REMOVE_ITEM TEST_OPS test_fused_feedforward_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) @@ -212,6 +214,7 @@ if (NOT WITH_GLOO) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_diff_length_gloo) endif() if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) @@ -377,14 +380,14 @@ function(bash_test_modules TARGET_NAME) if(WITH_COVERAGE) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS} WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else() add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS} bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -419,14 +422,14 @@ function(parallel_bash_test_modules TARGET_NAME) if(WITH_COVERAGE) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string} WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data bash 
${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else() add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string} bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index f269979746a08..c927476caecd1 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -248,5 +248,48 @@ def test_with_error(self): del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] +class TestException(unittest.TestCase): + def setUp(self): + self.place = paddle.CPUPlace() + + def build_program(self): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + w = paddle.rand([10, 20]) + ids = paddle.static.data(name="id", shape=[5], dtype='int64') + emb = paddle.nn.functional.embedding( + x=ids, weight=w, sparse=False, name="embedding") + + return main_program, startup_program, emb + + def _run(self, feeds): + paddle.seed(2020) + + main_program, startup_program, fetch_vars = self.build_program() + + exe = paddle.static.Executor(self.place) + exe.run(startup_program) + + for feed in feeds: + out = exe.run(main_program, feed=feed, fetch_list=fetch_vars) + + return out + + def run_new_executor(self, feed): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + out = self._run(feed) + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + return out + + def test_exception(self): + feed = [{ + 'id': np.array([1, 2, 3, 4, 5]).astype(np.int64) + }, { + 'id': np.array([1, 2, 3, 4, 11]).astype(np.int64) + }] + self.assertRaises(ValueError, self.run_new_executor, feed) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index b951afdfad5ea..927456b396ea5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -58,8 +58,10 @@ set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120) if(WITH_NV_JETSON) set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) + set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) else() set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) + set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) endif() set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py index 3f7c2a0fae6f0..acd920ccd57ae 
100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py @@ -24,8 +24,6 @@ class TrtConvertInstanceNormTest(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - inputs = program_config.inputs - weights = program_config.weights attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -38,52 +36,71 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], shape_input): - return np.ones(shape_input).astype(np.float32) + return np.random.random(shape_input).astype(np.float32) def generate_input2(attrs: List[Dict[str, Any]], shape_input): - return np.ones(len(shape_input) - 1).astype(np.float32) - - for epsilon in [0.0005, -1, 1]: - dics = [{"epsilon": epsilon}] - - ops_config = [{ - "op_type": "instance_norm", - "op_inputs": { - "X": ["input_data"], - "Scale": ["scale_data"], - "Bias": ["bias_data"] - }, - "op_outputs": { - "Y": ["y_data"], - "SavedMean": ["saved_mean_data"], - "SavedVariance": ["saved_variance_data"] - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - shape_input = [1, 3, 64, 64] - program_config = ProgramConfig( - ops=ops, - weights={ - "bias_data": TensorConfig(data_gen=partial( - generate_input2, dics, shape_input)), - "scale_data": TensorConfig(data_gen=partial( - generate_input2, dics, shape_input)) - }, - inputs={ - "input_data": TensorConfig(data_gen=partial( - generate_input1, dics, shape_input)) - }, - outputs=["y_data"]) - - yield program_config + return np.random.random(shape_input[1]).astype(np.float32) + + for batch in [1, 2, 4]: + for shape_input in [[batch, 16], [batch, 32, 64], + [batch, 16, 32, 64]]: + self.in_dim = len(shape_input) + for epsilon in [0.0005, -1, 1]: + dics = [{"epsilon": epsilon}] + ops_config = [{ + "op_type": "instance_norm", + "op_inputs": { + "X": ["input_data"], + "Scale": ["scale_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Y": ["y_data"], + "SavedMean": ["saved_mean_data"], + "SavedVariance": ["saved_variance_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={ + "bias_data": TensorConfig(data_gen=partial( + generate_input2, dics, shape_input)), + "scale_data": TensorConfig(data_gen=partial( + generate_input2, dics, shape_input)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dics, shape_input)) + }, + outputs=["y_data"]) + + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + if self.in_dim == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 4]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + elif self.in_dim == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 4]} + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 256] + } + self.dynamic_shape.opt_input_shape = {"input_data": [2, 3, 32]} + elif self.in_dim == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 
1, 4, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 128, 256] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 32, 32] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} @@ -91,8 +108,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - inputs = program_config.inputs - if dynamic_shape: + if dynamic_shape or self.in_dim != 4: return 0, 3 return 1, 2 @@ -108,7 +124,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-2 + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -117,7 +133,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num(attrs, - True), 1e-2 + True), 1e-5 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py index 2a8206e58e00e..c647849fa7ee4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py @@ -27,46 +27,59 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 3, 64, 64]).astype(np.float32) - - for alpha in [0.02, 1.0, 100.0, -1.0, 0.0]: - for X_scale in [1.0, 100.0, 0.01, -0.1, 0.0]: - dics = [{ - "alpha": alpha, - "use_mkldnn": True, - "enable_int8": True, - "X_scale": X_scale - }] - - ops_config = [{ - "op_type": "leaky_relu", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": { - "Out": ["y_data"], - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, dics)) - }, - outputs=["y_data"]) - - yield program_config + def generate_input1(shape): + return np.random.random(shape).astype(np.float32) + + for batch in [1, 2]: + for shape in [[batch, 64], [batch, 32, 64], [batch, 8, 32, 32]]: + self.input_dim = len(shape) + for alpha in [0.02, 1.0, 100.0, -1.0, 0.0]: + dics = [{"alpha": alpha}] + ops_config = [{ + "op_type": "leaky_relu", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["y_data"], + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, shape)) + }, + outputs=["y_data"]) + + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [4, 3, 64, 64]} + if self.input_dim == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} + self.dynamic_shape.max_input_shape = {"input_data": [64, 128]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + elif self.input_dim == 3: + 
self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]}
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [64, 128, 256]
+                }
+                self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 64]}
+            elif self.input_dim == 4:
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1, 8, 8, 4]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [64, 64, 128, 128]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [2, 16, 64, 32]
+                }
         def clear_dynamic_shape():
             self.dynamic_shape.min_input_shape = {}
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
index 9ec2f83fa5ba0..ddb96c37db780 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
@@ -18,6 +18,7 @@
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
 
 class TrtConvertPool2dTest(TrtLayerAutoScanTest):
@@ -32,6 +33,10 @@ def is_paddings_valid(self, program_config: ProgramConfig) -> bool:
         for index in range(len(ksize)):
             if ksize[index] <= paddings[index]:
                 return False
+        ver = paddle_infer.get_trt_compile_version()
+        if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000:
+            if program_config.ops[0].attrs['pooling_type'] == 'avg':
+                return False
         return True
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
@@ -46,16 +51,16 @@ def generate_input1(attrs: List[Dict[str, Any]]):
         def generate_weight1(attrs: List[Dict[str, Any]]):
             return np.random.random([24, 3, 3, 3]).astype(np.float32)
 
-        for strides in [[1, 1], [2, 2], [1, 2]]:
+        for strides in [[1, 1], [1, 2], [2, 2]]:
             for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]:
                 for pooling_type in ['max', 'avg']:
                     for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']:
                         for ksize in [[2, 3], [3, 3]]:
                             for data_format in ['NCHW']:
                                 for global_pooling in [True, False]:
-                                    for exclusive in [True, False]:
+                                    for exclusive in [False, True]:
                                         for adaptive in [True, False]:
-                                            for ceil_mode in [True, False]:
+                                            for ceil_mode in [False, True]:
 
                                                 dics = [{
                                                     "pooling_type":
@@ -157,6 +162,29 @@ def teller2(program_config, predictor_config):
             teller2, SkipReasons.TRT_NOT_IMPLEMENTED,
             "It is not support that global_pooling is true for trt now.")
 
+        def teller3(program_config, predictor_config):
+            if self.dynamic_shape.min_input_shape == {} and program_config.ops[
+                    0].attrs['ceil_mode'] == True:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller3, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "It is not supported that ceil_mode is true in static mode for trt now."
+        )
+
+        def teller4(program_config, predictor_config):
+            if self.dynamic_shape.min_input_shape != {} and (
+                    program_config.ops[0].attrs['strides'] == [1, 2] or
+                    program_config.ops[0].attrs['strides'] == [2, 2]):
+                return True
+            return False
+
+        self.add_skip_case(
+            teller4, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "It is not supported that strides are not equal to [1, 1] in dynamic mode for trt now."
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 080d1ccc9054b..99e99a8387784 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -107,5 +107,43 @@ def set_params(self): self.alpha = 2.0 +class TensorRTMatMulBroadcastTest(InferencePassTest): + def setUp(self): + self.set_params() + place = fluid.CPUPlace() + with fluid.program_guard(self.main_program, self.startup_program): + data_x = fluid.data( + name="data_x", shape=[-1, 6, 24], dtype="float32") + data_y = fluid.data(name="data_y", shape=[24, 16], dtype="float32") + matmul_out = fluid.layers.matmul( + x=data_x, + y=data_y, + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) + out = fluid.layers.batch_norm(matmul_out, is_test=True) + + self.feeds = { + "data_x": np.ones([2, 6, 24]).astype("float32"), + "data_y": np.ones([24, 16]).astype("float32") + } + self.enable_trt = True + self.trt_parameters = TensorRTMatMulBroadcastTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.transpose_x = False + self.transpose_y = False + self.alpha = 1.0 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py new file mode 100644 index 0000000000000..6fbddcf5a1fc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py @@ -0,0 +1,332 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
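For context, the pool2d teller above collapses the TensorRT compile version into a single integer before comparing it against 7000. A minimal sketch of that arithmetic, assuming a `(major, minor, patch)` tuple such as the one returned by `paddle.inference.get_trt_compile_version` (illustrative, not part of the patch):

    def trt_version_as_int(ver):
        # (major, minor, patch) -> e.g. (7, 1, 3) -> 7130, comparable with 7000
        return ver[0] * 1000 + ver[1] * 100 + ver[2] * 10

    assert trt_version_as_int((7, 1, 3)) >= 7000   # avg pooling case kept
    assert trt_version_as_int((6, 0, 1)) < 7000    # avg pooling case filtered out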
+
+import os
+import shutil
+import unittest
+import itertools
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TensorRTPool3dTest(InferencePassTest):
+ def setUp(self):
+ self.bs = 1
+ self.channel = 3
+ self.depth = 8
+ self.height = 8
+ self.width = 8
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 0
+ self.global_pooling = False
+ self.ceil_mode = False
+ self.exclusive = False
+ self.enable_trt = True
+ self.serialize = False
+ self.precision = AnalysisConfig.Precision.Float32
+ self.feeds = {
+ 'data': np.random.random(
+ [self.bs, self.channel, self.depth, self.height,
+ self.width]).astype('float32'),
+ }
+
+ def set_extra_config(self):
+ pass
+
+ def build_network(self):
+ self.set_extra_config()
+ self.trt_parameters = TensorRTPool3dTest.TensorRTParam(
+ 1 << 30, self.bs, 0, self.precision, self.serialize, False)
+
+ with fluid.program_guard(self.main_program, self.startup_program):
+ data = fluid.data(
+ name='data',
+ shape=[-1, self.channel, self.depth, self.height, self.width],
+ dtype='float32')
+ pool_out = fluid.layers.pool3d(
+ input=data,
+ pool_size=self.pool_size,
+ pool_type=self.pool_type,
+ pool_stride=self.pool_stride,
+ pool_padding=self.pool_padding,
+ global_pooling=self.global_pooling,
+ ceil_mode=self.ceil_mode,
+ exclusive=self.exclusive)
+ #out = fluid.layers.batch_norm(pool_out, is_test=True)
+ self.fetch_list = [pool_out]
+
+ def check_output(self):
+ if os.path.exists(self.path + "_opt_cache"):
+ shutil.rmtree(self.path + "_opt_cache")
+ if core.is_compiled_with_cuda():
+ use_gpu = True
+ self.check_output_with_option(use_gpu)
+ self.assertTrue(
+ PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+ def run_test(self):
+ self.build_network()
+ self.check_output()
+
+ def test(self):
+ precision_options = [
+ AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+ ]
+ serialize_options = [False, True]
+ dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
+ 'data': [
+ self.bs, self.channel, self.depth // 2, self.height // 2,
+ self.width // 2
+ ]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, False)
+ dynamic_shape_options = [None, dynamic_shape_profile]
+
+ for precision, serialize, dynamic_shape in itertools.product(
+ precision_options, serialize_options, dynamic_shape_options):
+ is_dynamic = dynamic_shape is not None
+ with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
+ format(precision, serialize, is_dynamic)):
+ self.precision = precision
+ self.serialize = serialize
+ self.dynamic_shape_params = dynamic_shape
+ self.run_test()
+
+
+class TensorRTAvgPool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'avg'
+ self.pool_stride = 1
+ self.pool_padding = 0
+ self.global_pooling = False
+ self.ceil_mode = False
+ self.exclusive = False
+
+
+class TensorRTGlobalPool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 0
+ self.global_pooling = True
+ self.ceil_mode = False
+ self.exclusive = False
+
+
+class TensorRTCeilPool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 0
+ self.global_pooling = False
+ self.ceil_mode = True
+ self.exclusive = False
+
+
+class TensorRTExclusivePool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 0
+ self.global_pooling = False
+ self.ceil_mode = False
+ self.exclusive = True
+
+
+class TensorRTSamePaddingPool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 'SAME'
+ self.global_pooling = False
+ self.ceil_mode = False
+ self.exclusive = False
+
+
+class TensorRTValidPaddingPool3dTest(TensorRTPool3dTest):
+ def set_extra_config(self):
+ self.pool_size = 2
+ self.pool_type = 'max'
+ self.pool_stride = 1
+ self.pool_padding = 'VALID'
+ self.global_pooling = False
+ self.ceil_mode = False
+ self.exclusive = False
+
+
+class TensorRTAdaptiveAvgPool3DTest(InferencePassTest):
+ def setUp(self):
+ self.bs = 1
+ self.channel = 3
+ self.depth = 8
+ self.height = 8
+ self.width = 8
+ self.enable_trt = True
+ self.serialize = False
+ self.precision = AnalysisConfig.Precision.Float32
+ self.feeds = {
+ 'data': np.random.random(
+ [self.bs, self.channel, self.depth, self.height,
+ self.width]).astype('float32'),
+ }
+
+ def build_network(self):
+ self.trt_parameters = TensorRTPool3dTest.TensorRTParam(
+ 1 << 30, self.bs, 0, self.precision, self.serialize, False)
+
+ with fluid.program_guard(self.main_program, self.startup_program):
+ data = fluid.data(
+ name='data',
+ shape=[-1, self.channel, self.depth, self.height, self.width],
+ dtype='float32')
+ pool_out = paddle.nn.functional.adaptive_avg_pool3d(
+ x=data, output_size=[3, 3, 3])
+ #out = fluid.layers.batch_norm(pool_out, is_test=True)
+ self.fetch_list = [pool_out]
+
+ def check_output(self):
+ if os.path.exists(self.path + "_opt_cache"):
+ shutil.rmtree(self.path + "_opt_cache")
+ if core.is_compiled_with_cuda():
+ use_gpu = True
+ self.check_output_with_option(use_gpu)
+ self.assertTrue(
+ PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+ def run_test(self):
+ self.build_network()
+ self.check_output()
+
+ def test(self):
+ precision_options = [
+ AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+ ]
+ serialize_options = [False, True]
+ dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
+ 'data': [
+ self.bs, self.channel, self.depth // 2, self.height // 2,
+ self.width // 2
+ ]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, False)
+ dynamic_shape_options = [None, dynamic_shape_profile]
+
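The test() methods in this file sweep every combination of precision, serialization, and shape mode with itertools.product, and wrap each combination in a subTest so that one failing configuration does not hide the others. A self-contained sketch of the pattern (the option values are illustrative stand-ins, not the real AnalysisConfig objects):

    import itertools
    import unittest

    class SweepExample(unittest.TestCase):
        def test_all_combinations(self):
            precision_options = ['fp32', 'fp16']
            serialize_options = [False, True]
            dynamic_shape_options = [None, 'profile']  # None means static shape
            for precision, serialize, dynamic_shape in itertools.product(
                    precision_options, serialize_options, dynamic_shape_options):
                # the predicate must test the loop variable, not the option list
                is_dynamic = dynamic_shape is not None
                with self.subTest(
                        precision=precision, serialize=serialize,
                        dynamic=is_dynamic):
                    pass  # configure and run one network here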
+ for precision, serialize, dynamic_shape in itertools.product(
+ precision_options, serialize_options, dynamic_shape_options):
+ is_dynamic = dynamic_shape is not None
+ with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
+ format(precision, serialize, is_dynamic)):
+ self.precision = precision
+ self.serialize = serialize
+ self.dynamic_shape_params = dynamic_shape
+ self.run_test()
+
+
+class TensorRTAdaptiveMaxPool3DTest(InferencePassTest):
+ def setUp(self):
+ self.bs = 1
+ self.channel = 3
+ self.depth = 8
+ self.height = 8
+ self.width = 8
+ self.enable_trt = True
+ self.serialize = False
+ self.precision = AnalysisConfig.Precision.Float32
+ self.feeds = {
+ 'data': np.random.random(
+ [self.bs, self.channel, self.depth, self.height,
+ self.width]).astype('float32'),
+ }
+
+ def build_network(self):
+ self.trt_parameters = TensorRTPool3dTest.TensorRTParam(
+ 1 << 30, self.bs, 0, self.precision, self.serialize, False)
+
+ with fluid.program_guard(self.main_program, self.startup_program):
+ data = fluid.data(
+ name='data',
+ shape=[-1, self.channel, self.depth, self.height, self.width],
+ dtype='float32')
+ pool_out = paddle.nn.functional.adaptive_max_pool3d(
+ x=data, output_size=[3, 3, 3])
+ #out = fluid.layers.batch_norm(pool_out, is_test=True)
+ self.fetch_list = [pool_out]
+
+ def check_output(self):
+ if os.path.exists(self.path + "_opt_cache"):
+ shutil.rmtree(self.path + "_opt_cache")
+ if core.is_compiled_with_cuda():
+ use_gpu = True
+ self.check_output_with_option(use_gpu)
+ self.assertTrue(
+ PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+ def run_test(self):
+ self.build_network()
+ self.check_output()
+
+ def test(self):
+ precision_options = [
+ AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+ ]
+ serialize_options = [False, True]
+ dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
+ 'data': [
+ self.bs, self.channel, self.depth // 2, self.height // 2,
+ self.width // 2
+ ]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, {
+ 'data':
+ [self.bs, self.channel, self.depth, self.height, self.width]
+ }, False)
+ dynamic_shape_options = [None, dynamic_shape_profile]
+
+ for precision, serialize, dynamic_shape in itertools.product(
+ precision_options, serialize_options, dynamic_shape_options):
+ is_dynamic = dynamic_shape is not None
+ with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
+ format(precision, serialize, is_dynamic)): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py index 824266578b9e5..2589b2a316a16 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py @@ -18,7 +18,7 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest, _set_use_system_allocator +from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -63,9 +63,6 @@ def set_npu(self): self.__class__.use_npu = True self.__class__.no_need_check_grad = True - def init_kernel_type(self): - self.use_mkldnn = False - def init_inputshape(self): self.input_shape = (2, 2, 2, 3, 3) @@ -158,7 +155,8 @@ def set_npu(self): self.__class__.use_npu = True def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + self.check_grad_with_place( + self.place, ["X"], "Out", max_relative_error=0.03) class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32): diff --git a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py index 5a3f98524bbd0..9289da6641e7d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py @@ -249,5 +249,45 @@ def init_testcase(self): self.outputs = {'Out': self.inputs['X'].cumsum()} +#----------------Cumsum Int64---------------- +class TestNPUCumSumOpInt64(TestNPUCumSumOp1): + def init_testcase(self): + self.attrs = {'axis': -1, 'reverse': True} + self.inputs = { + 'X': np.random.randint( + 1, 10000, size=(5, 6, 10)).astype(self.dtype) + } + self.outputs = { + 'Out': np.flip( + np.flip( + self.inputs['X'], axis=2).cumsum(axis=2), axis=2) + } + + +def create_test_int64(parent): + class TestCumSumInt64(parent): + def init_dtype(self): + self.dtype = np.int64 + + cls_name = "{0}_{1}".format(parent.__name__, "Int64") + TestCumSumInt64.__name__ = cls_name + globals()[cls_name] = TestCumSumInt64 + + +create_test_int64(TestNPUCumSumOp1) +create_test_int64(TestNPUCumSumOp2) +create_test_int64(TestNPUCumSumOp3) +create_test_int64(TestNPUCumSumOp4) +create_test_int64(TestNPUCumSumOp5) +create_test_int64(TestNPUCumSumOp7) +create_test_int64(TestNPUCumSumExclusive1) +create_test_int64(TestNPUCumSumExclusive2) +create_test_int64(TestNPUCumSumExclusive3) +create_test_int64(TestNPUCumSumExclusive4) +create_test_int64(TestNPUCumSumExclusive5) +create_test_int64(TestNPUCumSumReverseExclusive) +create_test_int64(TestNPUCumSumWithFlatten1) +create_test_int64(TestNPUCumSumWithFlatten2) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 7c8710fd42b36..fac2bc66ff49b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -95,6 +95,11 @@ def init_dtype(self): self.dtype = np.int32 +class TestElementwiseSubOpInt64(TestElementwiseSubOp): + def init_dtype(self): + self.dtype = np.int64 + + class TestSubtractAPI(unittest.TestCase): def test_name(self): with 
paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 56f04a6e993f3..1031be4c1a7b4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -33,14 +33,15 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init_dtype() - self.init_dim() + self.init_dims() + self.init_padding_idx() np.random.seed(SEED) - bsz = 6 - seqlen = 8 - vocab = 10 - w = np.ones([vocab, self.dim]).astype(self.dtype) - x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) - out = np.ones([bsz, seqlen, self.dim]).astype(self.dtype) + w = np.random.random([self.vocab, self.dim]).astype(self.dtype) + x = np.random.randint( + 0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32) + out = w[x] + if self.padding_idx != -1: + out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim) self.inputs = { 'W': OpTest.np_dtype_to_fluid_dtype(w), @@ -50,7 +51,7 @@ def setUp(self): 'is_sparse': False, 'is_distributed': False, 'remote_prefetch': False, - 'padding_idx': -1 + 'padding_idx': self.padding_idx } self.outputs = {'Out': out} @@ -60,10 +61,16 @@ def set_npu(self): def init_dtype(self): self.dtype = np.float32 - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 # embedding_dim is not multiple of 32 self.dim = 20 + def init_padding_idx(self): + self.padding_idx = -1 + def test_check_output(self): self.check_output_with_place(self.place) @@ -85,7 +92,10 @@ def set_npu(self): class TestLookupTableV2Dim32(TestLookupTableV2): - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 # embedding_dim is multiple of 32 self.dim = 64 @@ -96,7 +106,10 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2): def init_dtype(self): self.dtype = np.float16 - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 self.dim = 64 def set_npu(self): @@ -104,5 +117,10 @@ def set_npu(self): self.__class__.no_need_check_grad = True +class TestLookupTableV2WithPadding(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py new file mode 100644 index 0000000000000..a8dc0c137c353 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
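The reworked lookup_table_v2 test above builds its expected output with plain numpy: gather rows of the weight table, then zero the rows that were selected through padding_idx. The semantics in isolation (shapes chosen arbitrarily for the sketch):

    import numpy as np

    vocab, dim, padding_idx = 10, 4, 3
    w = np.random.random([vocab, dim]).astype(np.float32)
    x = np.random.randint(0, vocab, size=(2, 5)).astype(np.int32)

    out = w[x]                   # gather: out.shape == (2, 5, dim)
    out[x == padding_idx] = 0.0  # tokens equal to padding_idx embed to zeros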
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. + Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_npu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, 
N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (2, 2, 25)
+ self.y_shape = (25, 4)
+ self.transpose_X = False
+ self.transpose_Y = False
+
+
+class TestMatMulOp7(TestMatMulOp):
+ """
+ case [B, M, K] x [K, N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (1, 2, 25)
+ self.y_shape = (4, 25)
+ self.transpose_X = False
+ self.transpose_Y = True
+
+
+class TestMatMulOp8(TestMatMulOp):
+ """
+ case [B, M, K] x [K, N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (1, 25, 4)
+ self.y_shape = (25, 4)
+ self.transpose_X = True
+ self.transpose_Y = False
+
+
+class TestMatMulOp9(TestMatMulOp):
+ """
+ case [B, M, K] x [B, K, N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (2, 5, 10)
+ self.y_shape = (2, 10, 5)
+ self.transpose_X = False
+ self.transpose_Y = False
+
+
+class TestMatMulOp10(TestMatMulOp):
+ """
+ case [B, M, K] x [B, K, N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (2, 10, 5)
+ self.y_shape = (2, 10, 5)
+ self.transpose_X = True
+ self.transpose_Y = False
+
+
+class TestMatMulOp11(TestMatMulOp):
+ """
+ case [B, M, K] x [B, K, N] = [B, M, N]
+ """
+
+ def config(self):
+ self.x_shape = (2, 5, 10)
+ self.y_shape = (2, 5, 10)
+ self.transpose_X = False
+ self.transpose_Y = True
+
+
+class TestMatMulOp12(TestMatMulOp):
+ """
+ case to check the gradient for special case
+ """
+
+ def config(self):
+ self.x_shape = (100)
+ self.y_shape = (1, 2, 2, 100, 2)
+ self.transpose_X = False
+ self.transpose_Y = False
+
+
+class TestMatMulOp13(TestMatMulOp):
+ """
+ case to check the gradient for special case
+ """
+
+ def config(self):
+ self.x_shape = (2, 1, 100)
+ self.y_shape = (100)
+ self.transpose_X = False
+ self.transpose_Y = False
+
+
+#--------------------test matmul alpha--------------------
+def create_test_alpha_class(parent):
+ class TestMatMulOpAlphaCase(parent):
+ def init_alpha(self):
+ self.alpha = 0.125
+
+ cls_name = "{0}_{1}".format(parent.__name__, "Alpha")
+ TestMatMulOpAlphaCase.__name__ = cls_name
+ globals()[cls_name] = TestMatMulOpAlphaCase
+
+
+create_test_alpha_class(TestMatMulOp)
+create_test_alpha_class(TestMatMulOp1)
+create_test_alpha_class(TestMatMulOp2)
+create_test_alpha_class(TestMatMulOp3)
+create_test_alpha_class(TestMatMulOp4)
+create_test_alpha_class(TestMatMulOp5)
+create_test_alpha_class(TestMatMulOp6)
+create_test_alpha_class(TestMatMulOp9)
+create_test_alpha_class(TestMatMulOp10)
+create_test_alpha_class(TestMatMulOp11)
+create_test_alpha_class(TestMatMulOp12)
+create_test_alpha_class(TestMatMulOp13)
+
+
+#--------------------test matmul fp16--------------------
+def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+ class TestMatMulOpFp16Case(parent):
+ def init_dtype(self):
+ self.dtype = np.float16
+
+ def test_check_output(self):
+ self.check_output_with_place(self.place, atol=atol)
+
+ def test_check_grad(self):
+ self.check_grad_with_place(
+ self.place, ['X', 'Y'],
+ 'Out',
+ max_relative_error=max_relative_error)
+
+ cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+ TestMatMulOpFp16Case.__name__ = cls_name
+ globals()[cls_name] = TestMatMulOpFp16Case
+
+
+create_test_fp16_class(TestMatMulOp)
+create_test_fp16_class(TestMatMulOp1)
+create_test_fp16_class(TestMatMulOp2)
+create_test_fp16_class(TestMatMulOp3)
+create_test_fp16_class(TestMatMulOp4)
+create_test_fp16_class(TestMatMulOp5)
+create_test_fp16_class(TestMatMulOp6)
+create_test_fp16_class(TestMatMulOp9)
+create_test_fp16_class(TestMatMulOp10)
+create_test_fp16_class(TestMatMulOp11)
+create_test_fp16_class(TestMatMulOp12)
+create_test_fp16_class(TestMatMulOp13)
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bincount_op.py b/python/paddle/fluid/tests/unittests/test_bincount_op.py
new file mode 100644
index 0000000000000..851bf7b01125a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bincount_op.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+class TestBincountOpAPI(unittest.TestCase):
+ """Test bincount api."""
+
+ def test_static_graph(self):
+ startup_program = fluid.Program()
+ train_program = fluid.Program()
+ with fluid.program_guard(train_program, startup_program):
+ inputs = fluid.data(name='input', dtype='int64', shape=[7])
+ weights = fluid.data(name='weights', dtype='int64', shape=[7])
+ output = paddle.bincount(inputs, weights=weights)
+ place = fluid.CPUPlace()
+ if fluid.core.is_compiled_with_cuda():
+ place = fluid.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+ img = np.array([0, 1, 1, 3, 2, 1, 7]).astype(np.int64)
+ w = np.array([0, 1, 1, 2, 2, 1, 0]).astype(np.int64)
+ res = exe.run(train_program,
+ feed={'input': img,
+ 'weights': w},
+ fetch_list=[output])
+ actual = np.array(res[0])
+ expected = np.bincount(img, weights=w)
+ self.assertTrue(
+ (actual == expected).all(),
+ msg='bincount output is wrong, out =' + str(actual))
+
+ def test_dygraph(self):
+ with fluid.dygraph.guard():
+ inputs_np = np.array([0, 1, 1, 3, 2, 1, 7]).astype(np.int64)
+ inputs = fluid.dygraph.to_variable(inputs_np)
+ actual = paddle.bincount(inputs)
+ expected = np.bincount(inputs_np)
+ self.assertTrue(
+ (actual.numpy() == expected).all(),
+ msg='bincount output is wrong, out =' + str(actual.numpy()))
+
+
+class TestBincountOpError(unittest.TestCase):
+ """Test bincount op error."""
+
+ def run_network(self, net_func):
+ with fluid.dygraph.guard():
+ net_func()
+
+ def test_input_value_error(self):
+ """Test input tensor should be non-negative."""
+
+ def net_func():
+ input_value = paddle.to_tensor([1, 2, 3, 4, -5])
+ paddle.bincount(input_value)
+
+ with self.assertRaises(ValueError):
+ self.run_network(net_func)
+
+ def test_input_shape_error(self):
+ """Test input tensor should be a 1-D tensor."""
+
+ def net_func():
+ input_value = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
+ paddle.bincount(input_value)
+
+ with self.assertRaises(ValueError):
+ self.run_network(net_func)
+
+ def test_minlength_value_error(self):
+ """Test minlength should be a non-negative int."""
+
+ def net_func():
+ input_value = paddle.to_tensor([1, 2, 3, 4, 5])
+ paddle.bincount(input_value, minlength=-1)
+
+ with self.assertRaises(IndexError):
+ self.run_network(net_func)
+
+
def test_input_type_errors(self): + """Test input tensor should only contain non-negative ints.""" + + def net_func(): + input_value = paddle.to_tensor([1., 2., 3., 4., 5.]) + paddle.bincount(input_value) + + with self.assertRaises(TypeError): + self.run_network(net_func) + + def test_weights_shape_error(self): + """Test weights tensor should have the same shape as input tensor.""" + + def net_func(): + input_value = paddle.to_tensor([1, 2, 3, 4, 5]) + weights = paddle.to_tensor([1, 1, 1, 1, 1, 1]) + paddle.bincount(input_value, weights=weights) + + with self.assertRaises(ValueError): + self.run_network(net_func) + + +class TestBincountOp(OpTest): + # without weights + def setUp(self): + self.op_type = "bincount" + self.init_test_case() + self.inputs = {"X": self.np_input} + self.attrs = {"minlength": self.minlength} + self.outputs = {"Out": self.Out} + + def init_test_case(self): + self.minlength = 0 + self.np_input = np.random.randint(low=0, high=20, size=10) + self.Out = np.bincount(self.np_input, minlength=self.minlength) + + def test_check_output(self): + self.check_output() + + +class TestCase1(TestBincountOp): + # with weights(FLOAT32) + def setUp(self): + self.op_type = "bincount" + self.init_test_case() + self.inputs = {"X": self.np_input, "Weights": self.np_weights} + self.attrs = {"minlength": self.minlength} + self.outputs = {"Out": self.Out} + + def init_test_case(self): + self.minlength = 0 + self.np_weights = np.random.randint( + low=0, high=20, size=10).astype(np.float32) + self.np_input = np.random.randint(low=0, high=20, size=10) + self.Out = np.bincount( + self.np_input, weights=self.np_weights, + minlength=self.minlength).astype(np.float32) + + +class TestCase2(TestBincountOp): + # with weights(other) + def setUp(self): + self.op_type = "bincount" + self.init_test_case() + self.inputs = {"X": self.np_input, "Weights": self.np_weights} + self.attrs = {"minlength": self.minlength} + self.outputs = {"Out": self.Out} + + def init_test_case(self): + self.minlength = 0 + self.np_weights = np.random.randint(low=0, high=20, size=10) + self.np_input = np.random.randint(low=0, high=20, size=10) + self.Out = np.bincount( + self.np_input, weights=self.np_weights, minlength=self.minlength) + + +class TestCase3(TestBincountOp): + # empty input + def init_test_case(self): + self.minlength = 0 + self.np_input = np.array([], dtype=np.int64) + self.Out = np.bincount(self.np_input, minlength=self.minlength) + + +class TestCase4(TestBincountOp): + # with input(INT32) + def init_test_case(self): + self.minlength = 0 + self.np_input = np.random.randint( + low=0, high=20, size=10).astype(np.int32) + self.Out = np.bincount(self.np_input, minlength=self.minlength) + + +class TestCase5(TestBincountOp): + # with minlength greater than max(X) + def init_test_case(self): + self.minlength = 20 + self.np_input = np.random.randint(low=0, high=10, size=10) + self.Out = np.bincount(self.np_input, minlength=self.minlength) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 63985415c51f6..0b8a80f0c837a 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -515,10 +515,28 @@ def _get_data(self, batch, args): return batch elif args.update_method != "local": new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch + + # 
NOTE(@xiongkun03) args.diff_batch means the batch lengths differ across ranks:
+ # e.g. batch = [2, 3, 4, 5]: the first rank gets [2] and
+ # the second rank gets [3, 4, 5].
+ # this branch is used to test sparse_embedding_differ_length
+ if hasattr(args, "diff_batch") and args.diff_batch:
+ assert len(
+ batch) > 2, "in diff_batch mode, len(batch) must be > 2."
+ if paddle.distributed.get_rank() == 0:
+ new_batch.append(batch[0])
+ elif paddle.distributed.get_rank() == 1:
+ new_batch.extend(batch[1:])
+ else:
+ raise NotImplementedError(
+ "Currently, TestParallelDyGraphRunnerBase doesn't support world_size > 2"
+ )
+ return new_batch
+ else:
+ for offset, item in enumerate(batch):
+ if offset % 2 == args.trainer_id:
+ new_batch.append(item)
+ return new_batch
 else:
 return batch
@@ -699,6 +717,7 @@ def runtime_main(test_class):
 parser.add_argument('--use_fleet_api', action='store_true')
 parser.add_argument('--use_fleet_api_20', action='store_true')
 parser.add_argument('--use_local_sgd', action='store_true')
+ parser.add_argument('--diff_batch', action='store_true')
 parser.add_argument('--ut4grad_allreduce', action='store_true')
 parser.add_argument(
 '--hallreduce_inter_nranks', type=int, required=False, default=2)
@@ -798,6 +817,7 @@ def setUp(self):
 self._gloo_mode = False # now, support gloo backend
 self._pipeline_mode = False
 self._mp_mode = False
+ self._diff_batch = False
 # FIXME(typhoonzero): I added this stupid argument to enable
 # testing allreduce layers, which users can call layers.allreduce
 # to accumulate tensors at anywhere. Find a better way to do this
@@ -1100,6 +1120,8 @@ def _get_gloo_trainer_cmd(self, model, ep, update_method, trainer_id,
 #assert self._use_reader_alloc == False, "gloo not support _use_reduce"
 if self._save_model:
 tr_cmd += " --save_model"
+ if self._diff_batch:
+ tr_cmd += " --diff_batch"
 self.__use_cuda = False
 self.__use_xpu = False
 assert self.__use_cuda == False, "gloo not support use cuda"
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
index a5578d71c5cd0..7359adff62021 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
@@ -18,6 +18,7 @@
 import paddle.nn as nn
 import paddle.fluid.core as core
 import paddle.nn.functional as F
+import paddle.incubate.nn.functional as incubate_f
 from paddle.nn.layer.norm import LayerNorm
 from paddle.nn.layer.common import Linear, Dropout
 from paddle.nn.layer.transformer import _convert_attention_mask
@@ -33,6 +34,8 @@ def setUp(self):
 self.generate_input_data()
 paddle.set_default_dtype(self.x_type)
 self.__class__.op_type = "fused_attention"
+ # use autograd to check grad in this unittest.
+ self.__class__.no_need_check_grad = True self.q_proj = Linear( self.embed_dim, self.embed_dim, @@ -146,7 +149,9 @@ def GetBaselineOut(self): final_out = self.norm1(residual_out) if self.pre_layer_norm: final_out = self.norm2(residual_out) - return final_out + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + return final_out, tensor_query.grad def GetFusedAttentionOut(self): paddle.disable_static(place=paddle.CUDAPlace(0)) @@ -190,18 +195,22 @@ def GetFusedAttentionOut(self): if attn_mask is not None: attn_mask = _convert_attention_mask(attn_mask, x.dtype) - final_out = F.fused_multi_head_attention( + final_out = incubate_f.fused_multi_head_attention( x, qkv_weight_tensor, out_linear_weight, self.pre_layer_norm, ln1_scale, ln1_bias, ln2_scale, ln2_bias, epsilon, qkv_bias_tensor, out_linear_bias, attn_mask, self.dropout_prob, self.attn_dropout_prob, ln2_epsilon) - return final_out + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True) + return final_out, x.grad def test_fused_attention_op(self): - final_out_ref = self.GetBaselineOut() - final_out = self.GetFusedAttentionOut() + final_out_ref, x_grad_ref = self.GetBaselineOut() + final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-5) + np.testing.assert_allclose( + x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-5) class TestFusedAttentionOpFp16(TestFusedAttentionOp): @@ -225,10 +234,12 @@ def config(self): self.key_length, self.value_length = self.query_length, self.query_length def test_fused_attention_op(self): - final_out_ref = self.GetBaselineOut() - final_out = self.GetFusedAttentionOut() + final_out_ref, x_grad_ref = self.GetBaselineOut() + final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1) + np.testing.assert_allclose( + x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py new file mode 100644 index 0000000000000..e59ecc19d05cb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -0,0 +1,262 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
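The changes to test_fused_attention_op.py above extend the forward-only comparison to an autograd-based gradient check: the same upstream gradient is pushed through the reference path and the fused path, and the input gradients are compared alongside the outputs. A minimal sketch of that pattern, with relu standing in for the attention forward:

    import numpy as np
    import paddle

    x = paddle.to_tensor(
        np.random.rand(2, 3).astype('float32'), stop_gradient=False)
    out = paddle.nn.functional.relu(x)  # stand-in for the op under test
    dout = paddle.ones_like(out)        # fixed upstream gradient
    paddle.autograd.backward([out], [dout], retain_graph=True)
    ref_out, ref_x_grad = out.numpy(), x.grad.numpy()
    # ...rerun with the fused implementation, then compare both pairs:
    # np.testing.assert_allclose(ref_out, fused_out, rtol=1e-5, atol=1e-5)
    # np.testing.assert_allclose(ref_x_grad, fused_x_grad, rtol=1e-5, atol=1e-5)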
+ +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +from paddle.incubate.nn.layer.fused_transformer import FusedMultiHeadAttention +from paddle import tensor +from paddle.fluid import layers +from paddle.static import Program, program_guard +import unittest + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05): + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = x1 + if (has_scale): + x_scaled = weight * x1 + x_scaled_bias = x_scaled + if (has_bias): + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias, + ln_2_scale, ln_2_bias, qkv_weight, qkv_bias, + out_linear_weight, out_linear_bias): + batch_size = query.shape[0] + seq_len = query.shape[1] + embed_dim = query.shape[2] + + if (pre_layer_norm): + ln_out = layer_norm(query, True, True, ln_scale, ln_bias) + + num_head = qkv_weight.shape[1] + head_dim = qkv_weight.shape[2] + # embed_dim, 3, num_heads, self.head_dim + qkv_weight = qkv_weight.transpose((3, 0, 1, 2)) + qkv_weight = qkv_weight.reshape(qkv_weight.shape[0], qkv_weight.shape[1] * + qkv_weight.shape[2] * qkv_weight.shape[3]) + + if (pre_layer_norm): + ln_out = ln_out.reshape(batch_size * seq_len, embed_dim) + qkv = fc(ln_out, qkv_weight) + ln_out = ln_out.reshape(batch_size, seq_len, embed_dim) + else: + query = query.reshape(batch_size * seq_len, embed_dim) + qkv = fc(query, qkv_weight) + query = query.reshape(batch_size, seq_len, embed_dim) + + qkv = qkv.reshape(batch_size, seq_len, 3, num_head, head_dim) + # q*k^t + qkv = qkv.transpose( + (2, 0, 1, 3, 4)) # 3, batch_size, seq_len, num_head, head_dim + qkv = qkv.transpose( + (0, 1, 3, 2, 4)) # 3, batch_size, num_head, seq_len, head_dim + + q = qkv[0:1, ::] + q = q.reshape(batch_size, num_head, seq_len, head_dim) + k = qkv[1:2, ::] #[1, batch_size, num_head, seq_len, head_dim] + k = k.reshape(batch_size, num_head, seq_len, head_dim) + v = qkv[2::] + v = v.reshape(batch_size, num_head, seq_len, head_dim) + + k = k.transpose([0, 1, 3, 2]) #[batch_size, num_head, head_dim, seq_len] + qkt = batch_matmul(q, k / np.sqrt(head_dim, dtype=np.float64)) + + if attn_mask is not None: + if attn_mask.dtype.name == 'int64': + attn_mask = (attn_mask.astype(qkt.dtype) - 1.0) * 1e9 + else: + attn_mask = attn_mask.astype(qkt.dtype) + qkt += attn_mask + + # softmax + softmax_out = softmax(qkt) + attn_heads = 
batch_matmul(softmax_out, v) + + attn_heads = attn_heads.transpose( + (0, 2, 1, 3)) # [batch_size, seq_len, num_head, head_dim] + + # out_linear + out_linear_input = attn_heads.reshape(batch_size, seq_len, + num_head * head_dim) + out_linear_out = fc(out_linear_input, out_linear_weight) + + # bias add, dropout, residual add, layer_norm. + out_linear_bias_out = out_linear_out + out_linear_bias + out_linear_bias_dropout_out = out_linear_bias_out + out_linear_bias_dropout_residual_out = query + out_linear_bias_dropout_out + out_linear_bias_dropout_residual_ln_out = layer_norm( + out_linear_bias_dropout_residual_out, True, True, ln_2_scale, ln_2_bias) + return out_linear_bias_dropout_residual_ln_out + + +class TestFusedAttentionAPI(unittest.TestCase): + def setUp(self): + self.config() + self.generate_input_data() + + def config(self): + self.x_type = np.float32 + self.attn_mask_type = np.float64 + self.pre_layer_norm = True + self.training = True + self.need_weight = False + + self.batch_size = 1 + self.query_length = 2 + self.head_dim = 2 + self.num_heads = 2 + self.embed_dim = self.head_dim * self.num_heads + + self.dropout_prob = 0.0 + self.attn_dropout_prob = 0.0 + self.weight_attr = None + self.bias_attr = None + + self.kdim, self.vdim = self.embed_dim, self.embed_dim + self.key_length, self.value_length = self.query_length, self.query_length + + def generate_input_data(self): + self.query = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.attn_mask = np.ones( + (self.batch_size, self.num_heads, self.query_length, + self.key_length), + dtype=self.attn_mask_type) + if self.attn_mask_type == np.int64: + self.attn_mask = np.tril(self.attn_mask) + elif self.attn_mask_type == np.float64: + self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e9 + else: + raise ValueError("'attn_mask_type' should be 'int64' or 'float64'.") + self.key, self.value = self.query, self.query + + def run_imperative(self): + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr) + out = fused_attn( + paddle.to_tensor(self.query), + paddle.to_tensor(self.query), + paddle.to_tensor(self.query), paddle.to_tensor(self.attn_mask)) + ref_out = compute_reference(self.pre_layer_norm, self.query, + self.attn_mask, + fused_attn.pre_ln_scale.numpy(), + fused_attn.pre_ln_bias.numpy(), + fused_attn.ln_scale.numpy(), + fused_attn.ln_bias.numpy(), + fused_attn.qkv_weight.numpy(), + fused_attn.qkv_bias.numpy(), + fused_attn.linear_weight.numpy(), + fused_attn.linear_bias.numpy()) + self.assertTrue(np.allclose(ref_out, out, rtol=1e-5, atol=1e-5)) + + def run_static(self): + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr) + + x = paddle.static.data( + name='X', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + attn_mask = paddle.static.data( + name='SrcMask', + shape=[ + self.batch_size, self.num_heads, self.query_length, + self.key_length + ], + dtype=self.attn_mask_type) + final_out = fused_attn(x, x, x, attn_mask) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias 
= exe.run( + paddle.static.default_main_program(), + feed={"X": self.query, + "SrcMask": self.attn_mask}, + fetch_list=[ + final_out, fused_attn.qkv_weight, fused_attn.qkv_bias, + fused_attn.linear_weight, fused_attn.linear_bias, + fused_attn.pre_ln_scale, fused_attn.pre_ln_bias, + fused_attn.ln_scale, fused_attn.ln_bias + ]) + + return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(Program()): + out, qkv_weight, qkv_bias, linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = self.run_static( + ) + ref_out = compute_reference(self.pre_layer_norm, self.query, + self.attn_mask, ln_scale, ln_bias, + ln_2_scale, ln_2_bias, qkv_weight, qkv_bias, + linear_weight, linear_bias) + self.assertTrue( + np.allclose( + np.array(ref_out), np.array(out), rtol=1e-5, atol=1e-5)) + + def test_dynamic_api(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py new file mode 100644 index 0000000000000..5ea43d2edf0e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -0,0 +1,329 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
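The fused_feedforward tests below check the fused kernel against an unfused composition of existing primitives. For the post-layer-norm configuration, the reference path is, in functional form (a sketch assuming zero dropout, as in the tests):

    import paddle.nn.functional as F

    def ffn_reference(x, w1, b1, w2, b2, ln_scale, ln_bias, act=F.gelu):
        # linear1 -> activation -> dropout -> linear2 -> dropout
        # -> residual add -> layer_norm
        hidden = F.dropout(act(F.linear(x, w1, b1)), p=0.0)
        out = x + F.dropout(F.linear(hidden, w2, b2), p=0.0)
        return F.layer_norm(
            out, normalized_shape=out.shape[-1:], weight=ln_scale, bias=ln_bias)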
+import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.nn.layer import transformer +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +import unittest +from op_test import OpTest + + +class TestFusedFFNOp(OpTest): + def getDtype(self): + self.dtype = "float32" + self.layer_norm_dtype = "float32" + + def getShape(self): + self.batch_size = np.random.randint(1, 32) + self.query_length = np.random.randint(32, 128) + self.d_model = np.random.randint(32, 512) + self.dim_feedforward = np.random.randint(32, 512) + + def getDiff(self): + self.rtol = 1e-3 + self.atol = 1e-4 + + def getActivation(self): + self.act_method = "gelu" + + def getNormalizeBefore(self): + self.pre_layer_norm = False + + def setUp(self): + paddle.disable_static() + self.__class__.op_type = "fused_feedforward" + #check grad in test_out_and_grad() + self.__class__.no_need_check_grad = True + self.getDtype() + self.getShape() + self.getDiff() + self.getActivation() + self.getNormalizeBefore() + paddle.set_default_dtype(self.dtype) + self.weight_attr = None + self.bias_attr = None + + self.weight_attrs = transformer._convert_param_attr_to_list( + self.weight_attr, 2) + self.bias_attrs = transformer._convert_param_attr_to_list( + self.bias_attr, 2) + self.linear1 = Linear( + self.d_model, + self.dim_feedforward, + self.weight_attrs[1], + bias_attr=self.bias_attrs[1]) + self.linear2 = Linear( + self.dim_feedforward, + self.d_model, + self.weight_attrs[1], + bias_attr=self.bias_attrs[1]) + + paddle.set_default_dtype(self.layer_norm_dtype) + self.norm1 = LayerNorm(self.d_model) + self.norm2 = LayerNorm(self.d_model) + self.dropout = Dropout(0.0, mode="upscale_in_train") + self.dropout1 = Dropout(0.0, mode="upscale_in_train") + self.dropout2 = Dropout(0.0, mode="upscale_in_train") + self.activation = getattr(F, self.act_method) + + self.src = np.random.random((self.batch_size, self.query_length, + self.d_model)).astype(self.dtype) + self.dout = np.random.random((self.batch_size, self.query_length, + self.d_model)).astype(self.dtype) + + def Base(self): + paddle.disable_static() + tensor_src = paddle.to_tensor(self.src, stop_gradient=False) + residual = paddle.to_tensor(self.src) + if self.pre_layer_norm: + ln1_out = self.norm1(tensor_src) + linear2_out = self.linear2( + self.dropout(self.activation(self.linear1(ln1_out)))) + dropout2_out = residual + self.dropout2(linear2_out) + paddle.autograd.backward([dropout2_out], + [paddle.to_tensor(self.dout)], True) + return dropout2_out, tensor_src.grad + else: + linear2_out = self.linear2( + self.dropout(self.activation(self.linear1(tensor_src)))) + dropout2_out = residual + self.dropout2(linear2_out) + dropout2_out = self.norm2(dropout2_out) + paddle.autograd.backward([dropout2_out], + [paddle.to_tensor(self.dout)], True) + return dropout2_out, tensor_src.grad + + def FusedFFN(self): + paddle.disable_static() + linear1_weight = paddle.to_tensor( + self.linear1.weight, stop_gradient=False) + linear1_bias = paddle.to_tensor(self.linear1.bias, stop_gradient=False) + linear2_weight = paddle.to_tensor( + self.linear2.weight, stop_gradient=False) + linear2_bias = paddle.to_tensor(self.linear2.bias, stop_gradient=False) + ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False) + ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False) + ln2_scale = paddle.to_tensor(self.norm2.weight, 
stop_gradient=False)
+ ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)
+ x = paddle.to_tensor(self.src, stop_gradient=False)
+ out = incubate_f.fused_feedforward(
+ x,
+ linear1_weight,
+ linear2_weight,
+ linear1_bias,
+ linear2_bias,
+ ln1_scale,
+ ln1_bias,
+ ln2_scale,
+ ln2_bias,
+ 0.0,
+ 0.0,
+ activation=self.act_method,
+ pre_layer_norm=self.pre_layer_norm)
+ paddle.autograd.backward([out], [paddle.to_tensor(self.dout)])
+ return out, x.grad
+
+ def test_out_and_grad(self):
+ base_out, base_grad = self.Base()
+ fused_out, fused_grad = self.FusedFFN()
+ np.testing.assert_allclose(
+ base_out.numpy(), fused_out.numpy(), rtol=self.rtol, atol=self.atol)
+ np.testing.assert_allclose(
+ base_grad.numpy(),
+ fused_grad.numpy(),
+ rtol=self.rtol,
+ atol=self.atol)
+
+
+class TestFusedFFNOpFp16(TestFusedFFNOp):
+ def getDtype(self):
+ self.dtype = "float16"
+ self.layer_norm_dtype = "float32"
+
+ def getDiff(self):
+ self.rtol = 1e-1
+ self.atol = 1e-2
+
+ def getShape(self):
+ self.batch_size = 4
+ self.query_length = 32
+ self.d_model = 128
+ self.dim_feedforward = 256
+
+
+class TestFusedFFNOpFp64(TestFusedFFNOp):
+ def getDtype(self):
+ self.dtype = "float64"
+ self.layer_norm_dtype = "float64"
+
+
+class TestFusedFFNOpActivation(TestFusedFFNOp):
+ def getActivation(self):
+ self.act_method = "relu"
+
+
+class TestFusedFFNOpNormalizeBefore(TestFusedFFNOp):
+ def getNormalizeBefore(self):
+ self.pre_layer_norm = True
+
+ def getShape(self):
+ self.batch_size = 1
+ self.query_length = 1
+ self.d_model = 8
+ self.dim_feedforward = 8
+
+
+class APITestStaticFusedFFN(unittest.TestCase):
+ def test_static(self):
+ paddle.enable_static()
+ dtype = "float32"
+ layer_norm_dtype = "float32"
+ batch_size = 1
+ d_model = 8
+ dim_feedforward = 8
+
+ x = paddle.static.data(
+ name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype)
+ linear1_weight = paddle.static.data(
+ name='linear1_weight',
+ shape=[d_model, dim_feedforward],
+ dtype=dtype)
+ linear1_bias = paddle.static.data(
+ name='linear1_bias', shape=[dim_feedforward])
+ linear2_weight = paddle.static.data(
+ name='linear2_weight',
+ shape=[dim_feedforward, d_model],
+ dtype=dtype)
+ linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
+ ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
+ ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model])
+ ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
+ ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model])
+
+ fused_out = incubate_f.fused_feedforward(
+ x,
+ linear1_weight,
+ linear2_weight,
+ linear1_bias,
+ linear2_bias,
+ ln1_scale,
+ ln1_bias,
+ ln2_scale,
+ ln2_bias,
+ 0.0,
+ 0.0,
+ activation="relu",
+ pre_layer_norm=False)
+
+ ######base ffn######
+ linear1_out = F.linear(x, linear1_weight, linear1_bias)
+ act_out = F.relu(linear1_out)
+ dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
+ linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
+ dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
+ ln_out = F.layer_norm(
+ dropout2_out,
+ normalized_shape=list([d_model]),
+ weight=ln2_scale,
+ bias=ln2_bias)
+ ######base ffn######
+
+ exe = paddle.static.Executor(paddle.CUDAPlace(0))
+
+ x_data = np.random.random(
+ (batch_size, d_model, dim_feedforward)).astype(dtype)
+ linear1_weight_data = np.random.random(
+ (d_model, dim_feedforward)).astype(dtype)
+ linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype)
+ linear2_weight_data = np.random.random(
+ (dim_feedforward, d_model)).astype(dtype)
+ linear2_bias_data = np.zeros((d_model)).astype(dtype)
+
+ ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
+ ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
+ ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
+ ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
+
+ res_list = [fused_out, ln_out]
+ real_res = []
+
+ for res in res_list:
+ fetch = exe.run(feed={
+ 'x': x_data,
+ 'linear1_weight': linear1_weight_data,
+ 'linear1_bias': linear1_bias_data,
+ 'linear2_weight': linear2_weight_data,
+ 'linear2_bias': linear2_bias_data,
+ 'ln1_scale': ln1_scale_data,
+ 'ln1_bias': ln1_bias_data,
+ 'ln2_scale': ln2_scale_data,
+ 'ln2_bias': ln2_bias_data
+ },
+ fetch_list=[res])
+ real_res.append(fetch)
+ self.assertTrue(
+ np.allclose(
+ real_res[0], real_res[1], atol=1e-3),
+ "fused_feedforward output does not match the reference")
+
+
+class TestFusedFFNOpError(unittest.TestCase):
+ def test_errors(self):
+ paddle.enable_static()
+ with paddle.static.program_guard(paddle.static.Program(),
+ paddle.static.Program()):
+
+ def test_dtype():
+ x = paddle.static.data(
+ name='x', shape=[1, 10, 10], dtype="int32")
+ linear1_weight = paddle.static.data(
+ name='linear1_weight', shape=[1, 10, 10], dtype="float32")
+ linear2_weight = paddle.static.data(
+ name='linear2_weight', shape=[1, 10, 10], dtype="float32")
+ incubate_f.fused_feedforward(x, linear1_weight, linear2_weight)
+
+ self.assertRaises(TypeError, test_dtype)
+
+ def test_dropout_rate_type():
+ x = paddle.static.data(
+ name='x1', shape=[1, 10, 10], dtype="float32")
+ linear1_weight = paddle.static.data(
+ name='linear1_weight1', shape=[10, 10], dtype="float32")
+ linear2_weight = paddle.static.data(
+ name='linear2_weight1', shape=[10, 10], dtype="float32")
+ incubate_f.fused_feedforward(
+ x, linear1_weight, linear2_weight, dropout1_rate="a")
+
+ self.assertRaises(TypeError, test_dropout_rate_type)
+
+ def test_dropout_rate_value():
+ x = paddle.static.data(
+ name='x2', shape=[1, 10, 10], dtype="float32")
+ linear1_weight = paddle.static.data(
+ name='linear1_weight2', shape=[10, 10], dtype="float32")
+ linear2_weight = paddle.static.data(
+ name='linear2_weight2', shape=[10, 10], dtype="float32")
+ incubate_f.fused_feedforward(
+ x, linear1_weight, linear2_weight, dropout2_rate=-1)
+
+ self.assertRaises(ValueError, test_dropout_rate_value)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py
new file mode 100644
index 0000000000000..1c425a40a9b39
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
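The GLOO test below exercises the diff_batch branch added to test_dist_base.py earlier in this patch, which deliberately feeds the two ranks batches of different lengths. The splitting rule in isolation (hypothetical helper name, for illustration only):

    def split_diff_batch(batch, rank):
        # rank 0 gets the first sample, rank 1 gets all remaining samples
        assert len(batch) > 2, "in diff_batch mode, len(batch) must be > 2"
        if rank == 0:
            return batch[:1]
        elif rank == 1:
            return batch[1:]
        raise NotImplementedError("world_size > 2 is not supported")

    assert split_diff_batch([2, 3, 4, 5], 0) == [2]
    assert split_diff_batch([2, 3, 4, 5], 1) == [3, 4, 5]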
+
+from __future__ import print_function
+
+import os
+import sys
+import unittest
+
+import paddle.fluid as fluid
+from test_dist_base import TestDistBase
+from spawn_runner_base import TestDistSpawnRunner
+from parallel_dygraph_sparse_embedding import TestSparseEmbedding
+from parallel_dygraph_sparse_embedding_fp64 import TestSparseEmbeddingFP64
+
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestParallelDygraphSparseEmbedding_GLOO(TestDistBase):
+ def _setup_config(self):
+ self._sync_mode = False
+ self._gloo_mode = True
+ self._dygraph = True
+ self._diff_batch = True
+
+ def test_sparse_embedding(self):
+ self.check_with_place(
+ "parallel_dygraph_sparse_embedding.py",
+ delta=1e-5,
+ check_error_log=True,
+ log_name=flag_name)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
index d4722c2e1819f..d26c7a1bb441e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
@@ -14,17 +14,31 @@
 from __future__ import print_function

+import logging
 import numpy as np
 import paddle
 import unittest

 paddle.enable_static()

+logging.basicConfig(
+ format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+

+def set_cinn_flag(val):
+ cinn_compiled = False
+ try:
+ paddle.set_flags({'FLAGS_use_cinn': val})
+ cinn_compiled = True
+ except ValueError:
+ logger.warning("The installed Paddle is not compiled with CINN.")
+ return cinn_compiled
+
+
+@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
 class TestParallelExecutorRunCinn(unittest.TestCase):
 def test_run_from_cinn(self):
- paddle.set_flags({'FLAGS_use_cinn': False})
-
 main_program = paddle.static.Program()
 startup_program = paddle.static.Program()
 with paddle.static.program_guard(main_program, startup_program):
@@ -49,7 +63,7 @@ def test_run_from_cinn(self):
 fetch_list=[prediction.name],
 return_merged=False)

- paddle.set_flags({'FLAGS_use_cinn': False})
+ set_cinn_flag(False)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_roll_op.py b/python/paddle/fluid/tests/unittests/test_roll_op.py
index 99121d2953a14..bca7665b814db 100644
--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -122,6 +122,34 @@ def test_axis_out_range():
 self.assertRaises(ValueError, test_axis_out_range)

+ def test_shifts_as_tensor_dygraph(self):
+ with fluid.dygraph.guard():
+ x = paddle.arange(9).reshape([3, 3])
+ shape = paddle.shape(x)
+ shifts = shape // 2
+ axes = [0, 1]
+ out = paddle.roll(x, shifts=shifts, axis=axes).numpy()
+ expected_out = np.array([[8, 6, 7], [2, 0, 1], [5, 3, 4]])
+ self.assertTrue(np.allclose(out, expected_out))
+
+ def test_shifts_as_tensor_static(self):
+ with program_guard(Program(), Program()):
+ x = paddle.arange(9).reshape([3, 3]).astype('float32')
+ shape = paddle.shape(x)
+ shifts = shape // 2
+ axes = [0, 1]
+ out = paddle.roll(x, shifts=shifts, axis=axes)
+ expected_out = np.array([[8, 6, 7], [2, 0, 1], [5, 3, 4]])
+
+ exe = fluid.Executor(fluid.CPUPlace())
+ [out_np] = exe.run(fetch_list=[out])
+ self.assertTrue(np.allclose(out_np, expected_out))
+
+ if paddle.is_compiled_with_cuda():
+ exe = fluid.Executor(fluid.CUDAPlace(0))
+ [out_np] = exe.run(fetch_list=[out])
+ self.assertTrue(np.allclose(out_np,
expected_out)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py index a109a5aa5d1a6..ecbbd8f52db9b 100644 --- a/python/paddle/fluid/tests/unittests/test_signal.py +++ b/python/paddle/fluid/tests/unittests/test_signal.py @@ -652,7 +652,7 @@ def test_frame(self): self.assertTrue( np.allclose( frame_for_api_test(self.x, self.frame_length, self.hop_length, self.axis), - paddle.tensor.signal.frame( + paddle.signal.frame( paddle.to_tensor(self.x), self.frame_length, self.hop_length, @@ -678,7 +678,7 @@ def test_frame_static(self): mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) - output = paddle.tensor.signal.frame( + output = paddle.signal.frame( input, self.frame_length, self.hop_length, @@ -708,7 +708,7 @@ def test_frame_static(self): class TestFrameException(unittest.TestCase): def test_frame(self): with self.assertRaises(self.expect_exception): - paddle.tensor.signal.frame( + paddle.signal.frame( paddle.to_tensor(self.x), self.frame_length, self.hop_length, @@ -731,7 +731,7 @@ def test_overlap_add(self): self.assertTrue( np.allclose( overlap_add_for_api_test(self.x, self.hop_length, self.axis), - paddle.tensor.signal.overlap_add( + paddle.signal.overlap_add( paddle.to_tensor(self.x), self.hop_length, self.axis), @@ -756,7 +756,7 @@ def test_overlap_add_static(self): mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) - output = paddle.tensor.signal.overlap_add( + output = paddle.signal.overlap_add( input, self.hop_length, self.axis), @@ -783,7 +783,7 @@ def test_overlap_add_static(self): class TestOverlapAddException(unittest.TestCase): def test_overlap_add(self): with self.assertRaises(self.expect_exception): - paddle.tensor.signal.overlap_add( + paddle.signal.overlap_add( paddle.to_tensor(self.x), self.hop_length, self.axis) @@ -848,7 +848,7 @@ def test_stft(self): self.assertTrue( np.allclose( stft(self.x, self.n_fft, self.hop_length, self.win_length, win_l, self.center, self.pad_mode), - paddle.tensor.signal.stft( + paddle.signal.stft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, @@ -891,7 +891,7 @@ def test_stft(self): win_p = paddle.to_tensor(self.window) with self.assertRaises(self.expect_exception): - paddle.tensor.signal.stft( + paddle.signal.stft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, @@ -934,7 +934,7 @@ def test_istft(self): self.assertTrue( np.allclose( istft(self.x, self.hop_length, self.win_length, win_l, self.center, self.length), - paddle.tensor.signal.istft( + paddle.signal.istft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, @@ -986,7 +986,7 @@ def test_istft(self): win_p = paddle.to_tensor(self.window) with self.assertRaises(self.expect_exception): - paddle.tensor.signal.istft( + paddle.signal.istft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py index f1ba8828f2b33..1633d82772289 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py @@ -16,71 +16,48 @@ import sys sys.path.append("..") -import op_test import unittest +import op_test import numpy as np import paddle 
import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard - -class TestCastOp1(op_test.OpTest): - def setUp(self): - ipt = np.random.random(size=[10, 10]) - self.inputs = {'X': ipt.astype('float32')} - self.outputs = {'Out': ipt.astype('float32')} - self.attrs = { - 'in_dtype': int(core.VarDesc.VarType.FP32), - 'out_dtype': int(core.VarDesc.VarType.FP32) - } - self.op_type = 'cast' - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], ['Out']) - - -class TestCastOp2(op_test.OpTest): - def setUp(self): - ipt = np.random.random(size=[10, 10]) - self.inputs = {'X': ipt.astype('float32')} - self.outputs = {'Out': ipt.astype('float16')} - self.attrs = { - 'in_dtype': int(core.VarDesc.VarType.FP32), - 'out_dtype': int(core.VarDesc.VarType.FP16) - } - self.op_type = 'cast' - - def test_check_output(self): - #self.check_output(atol=1e-3) - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=1e-3) - - -class TestCastOp3(op_test.OpTest): - def setUp(self): - ipt = np.random.random(size=[10, 10]) - self.inputs = {'X': ipt.astype('float16')} - self.outputs = {'Out': ipt.astype('float32')} - self.attrs = { - 'in_dtype': int(core.VarDesc.VarType.FP16), - 'out_dtype': int(core.VarDesc.VarType.FP32) - } - self.op_type = 'cast' - - def test_check_output(self): - #self.check_output(atol=1e-3) - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=1e-3) +typeid_dict = { + 'int32': int(core.VarDesc.VarType.INT32), + 'int64': int(core.VarDesc.VarType.INT64), + 'float32': int(core.VarDesc.VarType.FP32), + 'float16': int(core.VarDesc.VarType.FP16), + 'bool': int(core.VarDesc.VarType.BOOL), +} + + +def create_test_class(in_typename, out_typename): + class Cls(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype(in_typename)} + self.outputs = {'Out': ipt.astype(in_typename).astype(out_typename)} + self.attrs = { + 'in_dtype': typeid_dict[in_typename], + 'out_dtype': typeid_dict[out_typename], + } + self.op_type = 'cast' + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + cls_name = "cast_{0}_{1}".format(in_typename, out_typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for in_type in {'float16', 'float32', 'int32', 'int64', 'bool'}: + for out_type in {'float16', 'float32', 'int32', 'int64'}: + create_test_class(in_type, out_type) class TestCastOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py new file mode 100644 index 0000000000000..6c58c7ccf2cc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py @@ -0,0 +1,216 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test_xpu import OpTest, XPUOpTest +import paddle +from paddle.fluid import Program, program_guard + + +class TestClipOp(XPUOpTest): + def set_xpu(self): + self.__class__.use_xpu = True + self.place = paddle.XPUPlace(0) + + def setUp(self): + self.set_xpu() + self.max_relative_error = 0.006 + + self.inputs = {} + self.initTestCase() + + self.op_type = "clip" + self.attrs = {} + self.attrs['min'] = self.min + self.attrs['max'] = self.max + if 'Min' in self.inputs: + min_v = self.inputs['Min'] + else: + min_v = self.attrs['min'] + + if 'Max' in self.inputs: + max_v = self.inputs['Max'] + else: + max_v = self.attrs['max'] + + input = np.random.random(self.shape).astype("float32") + input[np.abs(input - min_v) < self.max_relative_error] = 0.5 + input[np.abs(input - max_v) < self.max_relative_error] = 0.5 + self.inputs['X'] = input + self.outputs = {'Out': np.clip(self.inputs['X'], min_v, max_v)} + + def test_check_output(self): + paddle.enable_static() + self.check_output_with_place(self.place) + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad_with_place(self.place, ['X'], 'Out') + paddle.disable_static() + + def initTestCase(self): + self.shape = (4, 10, 10) + self.max = 0.8 + self.min = 0.3 + self.inputs['Max'] = np.array([0.8]).astype('float32') + self.inputs['Min'] = np.array([0.1]).astype('float32') + + +class TestCase1(TestClipOp): + def initTestCase(self): + self.shape = (8, 16, 8) + self.max = 0.7 + self.min = 0.0 + + +class TestCase2(TestClipOp): + def initTestCase(self): + self.shape = (8, 16) + self.max = 1.0 + self.min = 0.0 + + +class TestCase3(TestClipOp): + def initTestCase(self): + self.shape = (4, 8, 16) + self.max = 0.7 + self.min = 0.2 + + +class TestCase4(TestClipOp): + def initTestCase(self): + self.shape = (4, 8, 8) + self.max = 0.7 + self.min = 0.2 + self.inputs['Max'] = np.array([0.8]).astype('float32') + self.inputs['Min'] = np.array([0.3]).astype('float32') + + +class TestCase5(TestClipOp): + def initTestCase(self): + self.shape = (4, 8, 16) + self.max = 0.5 + self.min = 0.5 + + +class TestClipOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + input_data = np.random.random((2, 4)).astype("float32") + + def test_Variable(): + fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + + self.assertRaises(TypeError, test_Variable) + + def test_dtype(): + x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') + fluid.layers.clip(x=x2, min=-1.0, max=1.0) + + self.assertRaises(TypeError, test_dtype) + paddle.disable_static() + + +class TestClipAPI(unittest.TestCase): + def _executed_api(self, x, min=None, max=None): + return paddle.clip(x, min, max) + + def test_clip(self): + paddle.enable_static() + data_shape = [1, 9, 9, 4] + data = np.random.random(data_shape).astype('float32') + images = fluid.data(name='image', shape=data_shape, dtype='float32') + min 
= fluid.data(name='min', shape=[1], dtype='float32') + max = fluid.data(name='max', shape=[1], dtype='float32') + + place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + out_1 = self._executed_api(images, min=min, max=max) + out_2 = self._executed_api(images, min=0.2, max=0.9) + out_3 = self._executed_api(images, min=0.3) + out_4 = self._executed_api(images, max=0.7) + out_5 = self._executed_api(images, min=min) + out_6 = self._executed_api(images, max=max) + out_7 = self._executed_api(images, max=-1.) + out_8 = self._executed_api(images) + + res1, res2, res3, res4, res5, res6, res7, res8 = exe.run( + fluid.default_main_program(), + feed={ + "image": data, + "min": np.array([0.2]).astype('float32'), + "max": np.array([0.8]).astype('float32') + }, + fetch_list=[ + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + ]) + + self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) + self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9))) + self.assertTrue(np.allclose(res3, data.clip(min=0.3))) + self.assertTrue(np.allclose(res4, data.clip(max=0.7))) + self.assertTrue(np.allclose(res5, data.clip(min=0.2))) + self.assertTrue(np.allclose(res6, data.clip(max=0.8))) + self.assertTrue(np.allclose(res7, data.clip(max=-1))) + self.assertTrue(np.allclose(res8, data)) + paddle.disable_static() + + def test_clip_dygraph(self): + paddle.disable_static() + place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu( + ) else fluid.CPUPlace() + paddle.disable_static(place) + data_shape = [1, 9, 9, 4] + data = np.random.random(data_shape).astype('float32') + images = paddle.to_tensor(data, dtype='float32') + v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32)) + v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32)) + + out_1 = self._executed_api(images, min=0.2, max=0.8) + images = paddle.to_tensor(data, dtype='float32') + out_2 = self._executed_api(images, min=0.2, max=0.9) + images = paddle.to_tensor(data, dtype='float32') + out_3 = self._executed_api(images, min=v_min, max=v_max) + + self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) + self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) + self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8))) + + def test_errors(self): + paddle.enable_static() + x1 = fluid.data(name='x1', shape=[1], dtype="int16") + x2 = fluid.data(name='x2', shape=[1], dtype="int8") + self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8) + self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8) + paddle.disable_static() + + +class TestInplaceClipAPI(TestClipAPI): + def _executed_api(self, x, min=None, max=None): + return x.clip_(min, max) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py new file mode 100644 index 0000000000000..5496c53a420b9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -0,0 +1,272 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test_xpu import OpTest, XPUOpTest +import paddle +from paddle.fluid import Program, program_guard + + +def create_test_class(op_type, typename, callback): + class Cls(OpTest): + def setUp(self): + a = np.random.random(size=(10, 7)).astype(typename) + b = np.random.random(size=(10, 7)).astype(typename) + c = callback(a, b) + self.inputs = {'X': a, 'Y': b} + self.outputs = {'Out': c} + self.op_type = op_type + self.use_xpu = True + self.attrs = {'use_xpu': True} + + def test_check_output(self): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[2], dtype='int32') + y = fluid.layers.data(name='y', shape=[2], dtype='int32') + a = fluid.layers.data(name='a', shape=[2], dtype='int16') + if self.op_type == "less_than": + self.assertRaises( + TypeError, + fluid.layers.less_than, + x=x, + y=y, + force_cpu=1) + op = eval("fluid.layers.%s" % self.op_type) + self.assertRaises(TypeError, op, x=x, y=y, cond=1) + self.assertRaises(TypeError, op, x=x, y=a) + self.assertRaises(TypeError, op, x=a, y=y) + + cls_name = "{0}_{1}".format(op_type, typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for _type_name in {'float32', 'int32', 'int64'}: + if _type_name == 'float64' and core.is_compiled_with_rocm(): + _type_name = 'float32' + + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + + +def create_paddle_case(op_type, callback): + class PaddleCls(unittest.TestCase): + def setUp(self): + self.op_type = op_type + self.input_x = np.array([1, 2, 3, 4]).astype(np.int64) + self.input_y = np.array([1, 3, 2, 4]).astype(np.int64) + self.real_result = callback(self.input_x, self.input_y) + self.place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu( + ) else fluid.CPUPlace() + + def test_api(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.data(name='x', shape=[4], dtype='int64') + y = fluid.data(name='y', shape=[4], dtype='int64') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = fluid.Executor(self.place) + res, = exe.run(feed={"x": self.input_x, + "y": self.input_y}, + fetch_list=[out]) + self.assertEqual((res == self.real_result).all(), True) + + def test_api_float(self): + if self.op_type == "equal": + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.data(name='x', shape=[4], dtype='int64') + y = fluid.data(name='y', shape=[1], dtype='int64') + op = eval("paddle.%s" % 
(self.op_type)) + out = op(x, y) + exe = fluid.Executor(self.place) + res, = exe.run(feed={"x": self.input_x, + "y": 1.0}, + fetch_list=[out]) + self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) + self.assertEqual((res == self.real_result).all(), True) + + def test_dynamic_api(self): + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == self.real_result).all(), True) + paddle.enable_static() + + def test_dynamic_api_int(self): + if self.op_type == "equal": + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, 1) + self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) + self.assertEqual((out.numpy() == self.real_result).all(), True) + paddle.enable_static() + + def test_dynamic_api_float(self): + if self.op_type == "equal": + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, 1.0) + self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) + self.assertEqual((out.numpy() == self.real_result).all(), True) + paddle.enable_static() + + def test_assert(self): + def test_dynamic_api_string(self): + if self.op_type == "equal": + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, "1.0") + paddle.enable_static() + + self.assertRaises(TypeError, test_dynamic_api_string) + + def test_dynamic_api_bool(self): + if self.op_type == "equal": + paddle.disable_static() + x = paddle.to_tensor(self.input_x) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, True) + self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) + self.assertEqual((out.numpy() == self.real_result).all(), True) + paddle.enable_static() + + def test_broadcast_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype='int32') + y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) + input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32') + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + def test_broadcast_api_3(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype='int32') + y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 
5).reshape((5)).astype(np.int32) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + def test_bool_api_4(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') + y = paddle.static.data(name='y', shape=[3, 1], dtype='bool') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.array([True, False, True]).astype(np.bool) + input_y = np.array([True, True, False]).astype(np.bool) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + def test_bool_broadcast_api_4(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') + y = paddle.static.data(name='y', shape=[1], dtype='bool') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.array([True, False, True]).astype(np.bool) + input_y = np.array([True]).astype(np.bool) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + def test_attr_name(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[4], dtype='int32') + y = fluid.layers.data(name='y', shape=[4], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x=x, y=y, name="name_%s" % (self.op_type)) + self.assertEqual("name_%s" % (self.op_type) in out.name, True) + + cls_name = "TestCase_{}".format(op_type) + PaddleCls.__name__ = cls_name + globals()[cls_name] = PaddleCls + + +create_paddle_case('less_than', lambda _a, _b: _a < _b) +create_paddle_case('less_equal', lambda _a, _b: _a <= _b) +create_paddle_case('greater_than', lambda _a, _b: _a > _b) +create_paddle_case('greater_equal', lambda _a, _b: _a >= _b) +create_paddle_case('equal', lambda _a, _b: _a == _b) +create_paddle_case('not_equal', lambda _a, _b: _a != _b) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py index 7c546391f6f43..68e5a6ccdbfb7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py @@ -97,5 +97,27 @@ def initParameters(self): self.axis = 3 +class TestStackOpint64(TestStackOpBase): + def initDefaultParameters(self): + self.num_inputs = 4 + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'int64' + + def initParameters(self): + self.num_inputs = 16 + + +class TestStackOpint(TestStackOpBase): + def initDefaultParameters(self): + self.num_inputs = 4 + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'int' + + def initParameters(self): + self.num_inputs = 16 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py new file mode 100644 index 0000000000000..aada78e4ec6a4 --- /dev/null +++ b/python/paddle/incubate/nn/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .layer.fused_transformer import FusedMultiHeadAttention # noqa: F401 + +__all__ = [ #noqa + 'FusedMultiHeadAttention', +] diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py new file mode 100644 index 0000000000000..4d1c3eee025b0 --- /dev/null +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .fused_transformer import fused_multi_head_attention +from .fused_transformer import fused_feedforward + +__all__ = ['fused_multi_head_attention', 'fused_feedforward'] diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py new file mode 100644 index 0000000000000..68109b4ae694a --- /dev/null +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -0,0 +1,385 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
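+#
+# NOTE(editor): this new module provides the fused transformer functional
+# ops, `fused_feedforward` and `fused_multi_head_attention`, defined below.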
+
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import in_dygraph_mode
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
+from paddle.fluid import core, dygraph_utils
+from paddle import _C_ops
+
+__all__ = []
+
+
+def _verify_dropout_rate(dropout_rate):
+    if not isinstance(dropout_rate, (float, int)):
+        raise TypeError("dropout_rate argument should be a number")
+    if dropout_rate < 0 or dropout_rate > 1:
+        raise ValueError("dropout_rate argument should be between 0 and 1")
+
+
+def fused_feedforward(x,
+                      linear1_weight,
+                      linear2_weight,
+                      linear1_bias=None,
+                      linear2_bias=None,
+                      ln1_scale=None,
+                      ln1_bias=None,
+                      ln2_scale=None,
+                      ln2_bias=None,
+                      dropout1_rate=0.5,
+                      dropout2_rate=0.5,
+                      activation="relu",
+                      ln1_epsilon=1e-5,
+                      ln2_epsilon=1e-5,
+                      pre_layer_norm=False,
+                      name=None):
+    """
+    This is a fusion operator that computes the feed-forward layer in the transformer
+    model architecture. This operator only supports running on GPU. The function of
+    the operator is consistent with the following pseudo code:
+
+    .. code-block:: python
+
+        residual = src
+        if pre_layer_norm:
+            src = layer_norm(src)
+        src = linear2(dropout1(activation(linear1(src))))
+        src = residual + dropout2(src)
+        if not pre_layer_norm:
+            src = layer_norm(src)
+
+    Args:
+        x (Tensor): The input tensor, a 3-D tensor with data type float16, float32 or float64; the shape is `[batch\_size, sequence\_length, d\_model]`.
+        linear1_weight (Tensor): The weight of the first linear, the data type is the same as `x`, the shape is `[d\_model, dim\_feedforward]`.
+        linear2_weight (Tensor): The weight of the second linear, the data type is the same as `x`, the shape is `[dim\_feedforward, d\_model]`.
+        linear1_bias (Tensor, optional): The bias of the first linear, the data type is the same as `x`, the shape is `[dim_feedforward]`. Default None.
+        linear2_bias (Tensor, optional): The bias of the second linear, the data type is the same as `x`, the shape is `[d_model]`. Default None.
+        ln1_scale (Tensor, optional): The weight of the first layer_norm, the data type is float32 or float64, the shape is same as `x`. Default None.
+        ln1_bias (Tensor, optional): The bias of the first layer_norm, the data type is float32 or float64, the shape is `[d\_model]`. Default None.
+        ln2_scale (Tensor, optional): The weight of the second layer_norm, the data type is float32 or float64, the shape is same as `x`. Default None.
+        ln2_bias (Tensor, optional): The bias of the second layer_norm, the data type is float32 or float64, the shape is `[d\_model]`. Default None.
+        dropout1_rate (float, optional): The probability of setting units to zero in the first dropout. Default 0.5.
+        dropout2_rate (float, optional): The probability of setting units to zero in the second dropout. Default 0.5.
+        activation (str, optional): The activation. Default "relu".
+        ln1_epsilon (float, optional): Small float added to the denominator of the first layer_norm to avoid dividing by zero. Default is 1e-5.
+        ln2_epsilon (float, optional): Small float added to the denominator of the second layer_norm to avoid dividing by zero. Default is 1e-5.
+        pre_layer_norm (bool, optional): Whether to apply layer_norm in the pre-processing (True) or post-processing (False) stage. Default False.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The output Tensor, the data type and shape are the same as `x`.
+
+    Examples:
+        .. 
code-block:: python + + # required: gpu + import paddle + import numpy as np + x_data = np.random.random((1, 8, 8)).astype("float32") + linear1_weight_data = np.random.random((8, 8)).astype("float32") + linear2_weight_data = np.random.random((8, 8)).astype("float32") + x = paddle.to_tensor(x_data) + linear1_weight = paddle.to_tensor(linear1_weight_data) + linear2_weight = paddle.to_tensor(linear2_weight_data) + out = paddle.incubate.nn.functional.fused_feedforward(x, linear1_weight, linear2_weight) + print(out.numpy().shape) + # (1, 8, 8) + """ + _verify_dropout_rate(dropout1_rate) + _verify_dropout_rate(dropout2_rate) + + if in_dygraph_mode(): + out, _, _, _, _, _, _, _, _, _, _ = _C_ops.fused_feedforward( + x, None, None, linear1_weight, linear1_bias, linear2_weight, + linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias, + 'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon, + 'ln2_epsilon', ln2_epsilon, 'act_method', activation, + 'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate) + return out + + helper = LayerHelper("fused_feedforward") + dtype = x.dtype + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_feedforward') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_feedforward') + + out = helper.create_variable_for_type_inference(x.dtype) + dropout1_mask = helper.create_variable_for_type_inference( + 'uint8', stop_gradient=True) + dropout2_mask = helper.create_variable_for_type_inference( + 'uint8', stop_gradient=True) + ln1_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln1_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln2_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln2_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + linear1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + dropout1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + dropout2_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + + helper.append_op( + type='fused_feedforward', + inputs={ + 'X': x, + 'Linear1Weight': linear1_weight, + 'Linear1Bias': linear1_bias, + 'Linear2Weight': linear2_weight, + 'Linear2Bias': linear2_bias, + 'Ln1Scale': ln1_scale, + 'Ln1Bias': ln1_bias, + 'Ln2Scale': ln2_scale, + 'Ln2Bias': ln2_bias, + }, + outputs={ + 'Out': out, + 'Dropout1Mask': dropout1_mask, + 'Dropout2Mask': dropout2_mask, + 'Ln1Mean': ln1_mean, + 'Ln1Variance': ln1_variance, + 'Ln2Mean': ln2_mean, + 'Ln2Variance': ln2_variance, + 'Linear1Out': linear1_out, + 'Ln1Out': ln1_out, + 'Dropout1Out': dropout1_out, + 'Dropout2Out': dropout2_out, + }, + attrs={ + 'dropout1_rate': dropout1_rate, + 'dropout2_rate': dropout2_rate, + 'act_method': activation, + 'pre_layer_norm': pre_layer_norm, + 'ln1_epsilon': ln1_epsilon, + 'ln2_epsilon': ln2_epsilon, + }) + return out + + +def fused_multi_head_attention(x, + qkv_weight, + linear_weight, + pre_layer_norm=False, + pre_ln_scale=None, + pre_ln_bias=None, + ln_scale=None, + ln_bias=None, + pre_ln_epsilon=1e-05, + qkv_bias=None, + linear_bias=None, + attn_mask=None, + dropout_rate=0.5, + attn_dropout_rate=0.5, + ln_epsilon=1e-05, + name=None): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to 
jointly attend
+    to information from different representation subspaces. This API only
+    supports self-attention. The pseudo code is as follows:
+    if pre_layer_norm:
+        out = layer_norm(x)
+        out = linear(out) + qkv_bias
+    else:
+        out = linear(x) + qkv_bias
+    out = transpose(out, perm=[2, 0, 3, 1, 4])
+    # extract q, k and v from out.
+    q = out[0:1,::]
+    k = out[1:2,::]
+    v = out[2:3,::]
+    out = q * k^t
+    out = attn_mask + out
+    out = softmax(out)
+    out = dropout(out)
+    out = out * v
+    out = transpose(out, perm=[0, 2, 1, 3])
+    out = out_linear(out)
+    out = layer_norm(x + dropout(linear_bias + out))
+
+    Parameters:
+        x (Tensor): The input tensor of fused_multi_head_attention. The shape is
+            `[batch\_size, sequence\_len, embed\_dim]`.
+        qkv_weight (Tensor): The qkv weight tensor. The shape is `[3, num_head, dim_head, dim_embed]`.
+        linear_weight (Tensor): The linear weight tensor. The shape is `[embed_dim, embed_dim]`.
+        pre_layer_norm (bool, optional): whether it is pre_layer_norm (True) or post_layer_norm architecture
+            (False). Default False.
+        pre_ln_scale (Tensor, optional): The weight tensor of pre layernorm. Default None.
+        pre_ln_bias (Tensor, optional): The bias tensor of pre layernorm. Default None.
+        ln_scale (Tensor, optional): The weight tensor of layernorm. Default None.
+        ln_bias (Tensor, optional): The bias tensor of layernorm. Default None.
+        pre_ln_epsilon (float, optional): Small float value added to the denominator of the pre layer_norm
+            to avoid dividing by zero. Default is 1e-5.
+        qkv_bias (Tensor, optional): The bias of qkv computation. The shape is `[3, num_head, dim_head]`.
+            Default None.
+        linear_bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None.
+        attn_mask (Tensor, optional): A tensor used in multi-head attention to prevent attention to
+            some unwanted positions, usually the paddings or the subsequent positions. It is a tensor
+            with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the
+            data type is bool, the unwanted positions have `False` values and the others have `True` values.
+            When the data type is int, the unwanted positions have 0 values and the others have 1 values.
+            When the data type is float, the unwanted positions have `-INF` values and the others have 0 values.
+            It can be None when nothing needs to be prevented from being attended to. Default None.
+        dropout_rate (float, optional): The dropout probability used on attention
+            weights to drop some attention targets for the dropout after attention.
+            0 for no dropout. Default 0.5.
+        attn_dropout_rate (float, optional): The dropout probability used on attention
+            weights to drop some attention targets for the dropout in attention.
+            0 for no dropout. Default 0.5.
+        ln_epsilon (float, optional): Small float value added to the denominator of layer_norm
+            to avoid dividing by zero. Default is 1e-5.
+
+    Examples:
+
+        .. 
code-block:: python + + # required: gpu + import paddle + import paddle.incubate.nn.functional as F + + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand(shape=(2, 4, 128), dtype="float32") + # qkv_weight: [3, num_head, head_dim, embed_dim] + qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + # qkv_bias: [3, num_head, head_dim] + qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + # linear_weight: [embed_dim, embed_dim] + linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + # linear_bias: [embed_dim] + linear_bias = paddle.rand(shape=[128], dtype="float32") + # self attention mask: [batch_size, num_heads, seq_len, seq_len] + attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") + + # output: [batch_size, seq_len, embed_dim] + output = F.fused_multi_head_attention( + x, qkv_weight, linear_weight, False, + None, None, None, None, 1e-5, qkv_bias, + linear_bias, attn_mask) + # [2, 4, 128] + print(output.shape) + """ + if in_dygraph_mode(): + # pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out, + # qktv_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, attn_mask_out, fmha_out, + # linear_out, dropout_mask_out, ln_mean_out, ln_var_out, bias_dropout_residual_out, final_out + assert len(qkv_weight.shape + ) == 4, "The dims of the shape of qkv_weight should be 4." + assert qkv_weight.shape[ + 0] == 3, "The shape of qkv_weight should be [3, num_head, head_dim, embed_dim]." + assert qkv_weight.shape[3] == x.shape[ + 2], "The 3rd dim of qkv_weight and 2nd dim of x should be the same, i.e., embed_dim." + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, final_out = _C_ops.fused_attention( + x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, attn_mask, + linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', + pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate', + dropout_rate, 'attn_dropout_rate', attn_dropout_rate, 'ln_epsilon', + ln_epsilon) + return final_out + else: + helper = LayerHelper('fused_multi_head_attention', **locals()) + dtype = x.dtype + # check dtypes + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_multihead_attention') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_multi_head_attention') + + # set inputs + inputs = dict() + inputs['X'] = [x] + if pre_ln_scale: + inputs['LnScale'] = [pre_ln_scale] + if pre_ln_bias: + inputs['LnBias'] = [pre_ln_bias] + inputs['QKVW'] = [qkv_weight] + inputs['QKVBias'] = [qkv_bias] + inputs['SrcMask'] = attn_mask + inputs['OutLinearW'] = [linear_weight] + inputs['OutLinearBias'] = [linear_bias] + if ln_scale: + inputs['Ln2Scale'] = [ln_scale] + if ln_bias: + inputs['Ln2Bias'] = [ln_bias] + + # set attrs + attrs = { + 'pre_layer_norm': pre_layer_norm, + 'epsilon': pre_ln_epsilon, + 'ln_epsilon': ln_epsilon, + 'dropout_rate': dropout_rate, + 'attn_dropout_rate': attn_dropout_rate + } + + # set outputs + pre_ln_mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + pre_ln_variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + pre_ln_out = helper.create_variable_for_type_inference(dtype=dtype) + + qkv_out = helper.create_variable_for_type_inference(dtype=dtype) + qkv_bias_out = helper.create_variable_for_type_inference(dtype=dtype) + + transpose_out = helper.create_variable_for_type_inference(dtype=dtype) + qk_out = helper.create_variable_for_type_inference(dtype=dtype) + qktv_out = 
helper.create_variable_for_type_inference(dtype=dtype) + softmax_out = helper.create_variable_for_type_inference(dtype=dtype) + attn_dropout_mask_out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + attn_dropout_out = helper.create_variable_for_type_inference( + dtype=dtype) + attn_mask_out = helper.create_variable_for_type_inference(dtype=dtype) + fmha_out = helper.create_variable_for_type_inference(dtype=dtype) + out_linear_out = helper.create_variable_for_type_inference(dtype=dtype) + dropout_mask_out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + ln_mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + ln_variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + bias_dropout_residual_out = helper.create_variable_for_type_inference( + dtype=dtype) + final_out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='fused_attention', + inputs=inputs, + outputs={ + "LnMean": pre_ln_mean_out, + "LnVariance": pre_ln_variance_out, + "LnOut": pre_ln_out, + "QKVOut": qkv_out, + "QKVBiasOut": qkv_bias_out, + "TransposeOut2": transpose_out, + "QKOut": qk_out, + "QKTVOut": qktv_out, + "SoftmaxOut": softmax_out, + "AttnDropoutMaskOut": attn_dropout_mask_out, + "AttnDropoutOut": attn_dropout_out, + "SrcMaskOut": attn_mask_out, + "FMHAOut": fmha_out, + "OutLinearOut": out_linear_out, + "DropoutMaskOut": dropout_mask_out, + "Ln2Mean": ln_mean_out, + "Ln2Variance": ln_variance_out, + "BiasDropoutResidualOut": bias_dropout_residual_out, + 'Y': final_out + }, + attrs=attrs) + return final_out diff --git a/python/paddle/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py similarity index 79% rename from python/paddle/nn/layer/fused_transformer.py rename to python/paddle/incubate/nn/layer/fused_transformer.py index 0084f7ff339df..16588dcef3d27 100644 --- a/python/paddle/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -12,27 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy +from paddle.nn import functional as F +from paddle.incubate.nn import functional as incubate_f +from paddle.nn import Layer +from paddle.framework import ParamAttr +import paddle +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle.nn.initializer import Constant + +import collections + class FusedMultiHeadAttention(Layer): """ - Attention mapps queries and a set of key-value pairs to outputs, and + Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. - Please refer to `Attention Is All You Need `_ for more details. - Parameters: embed_dim (int): The expected feature size in the input and output. num_heads (int): The number of heads in multi-head attention. - dropout (float, optional): The dropout probability used on attention - weights to drop some attention targets. 0 for no dropout. Default 0 + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. 
+        attn_dropout_rate (float, optional): The dropout probability used on attention
+            weights to drop some attention targets for the dropout in attention.
+            0 for no dropout. Default 0.5.
         kdim (int, optional): The feature size in key. If None, assumed equal to
             `embed_dim`. Default None.
         vdim (int, optional): The feature size in value. If None, assumed equal to
             `embed_dim`. Default None.
+        normalize_before (bool, optional): Indicate whether it is pre_layer_norm (True)
+            or post_layer_norm architecture (False). Default False.
         need_weights (bool, optional): Indicate whether to return the attention
-            weights. Default False.
+            weights. Now, only False is supported. Default False.
         weight_attr(ParamAttr, optional): To specify the weight parameter property.
             Default: None, which means the default weight parameter property is used.
             See usage for details in :code:`ParamAttr` .
@@ -40,35 +55,84 @@ class FusedMultiHeadAttention(Layer):
             Default: None, which means the default bias parameter property is used.
             If it is set to False, this layer will not have trainable bias parameter.
             See usage for details in :code:`ParamAttr` .
-
     Examples:
-
         .. code-block:: python
-
             import paddle
-
-            # encoder input: [batch_size, sequence_length, d_model]
+            # input: [batch_size, sequence_length, embed_dim]
             query = paddle.rand((2, 4, 128))
             # self attention mask: [batch_size, num_heads, query_len, query_len]
             attn_mask = paddle.rand((2, 2, 4, 4))
-            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
+            multi_head_attn = paddle.incubate.nn.FusedMultiHeadAttention(128, 2)
             output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
     """
 
-    Cache = collections.namedtuple("Cache", ["k", "v"])
-    StaticCache = collections.namedtuple("StaticCache", ["k", "v"])
-
     def __init__(self,
                  embed_dim,
                  num_heads,
-                 dropout=0.,
+                 dropout_rate=0.5,
+                 attn_dropout_rate=0.5,
                  kdim=None,
                  vdim=None,
+                 normalize_before=False,
                  need_weights=False,
                  weight_attr=None,
-                 bias_attr=None):
+                 bias_attr=None,
+                 name=None):
         super(FusedMultiHeadAttention, self).__init__()
-        raise NotImplementedError()
+
+        assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
+                               "but received {}".format(embed_dim))
+        assert num_heads > 0, ("Expected num_heads to be greater than 0, "
+                               "but received {}".format(num_heads))
+
+        attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
+        self.normalize_before = normalize_before
+        self._dtype = self._helper.get_default_dtype()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+        assert need_weights == False, "Only need_weights=False is supported for now."
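+        # NOTE(editor): the shapes below describe the packed parameter layout
+        # the fused kernel expects: the Q, K and V projection weights live in
+        # one [3, num_heads, head_dim, embed_dim] tensor and their biases in a
+        # [3, num_heads, head_dim] tensor, so one GEMM yields all three projections.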
+ + self.qkv_weight = self.create_parameter( + shape=[3, num_heads, self.head_dim, embed_dim], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.qkv_bias = self.create_parameter( + shape=[3, num_heads, self.head_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.linear_weight = self.create_parameter( + shape=[embed_dim, embed_dim], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + + self.pre_ln_scale = self.create_parameter( + attr=self._weight_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.pre_ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True) + self.ln_scale = self.create_parameter( + attr=self._weight_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True) + + self.dropout_rate = dropout_rate + self.attn_dropout_rate = attn_dropout_rate + + self.name = name def forward(self, query, key=None, value=None, attn_mask=None, cache=None): """ @@ -97,30 +161,34 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): - It is a namedtuple with `k` and `v` as fields, and stores tensors - shaped `[batch_size, num_heads, length, embed_dim]` which are results - of linear projection, reshape and transpose calculations in - MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` - fields reserve intermediate results of previous positions, which - mostly used for decoder self attention. If it is an instance of - `StaticCache`, `key` and `value` args would be ignored, `k` and - `v` fields would be used as calculated results on `key` and - `value`, which mostly used for decoder-encoder cross attention. - It is only used for inference and should be None for training. - Default None. + Now, only None is supported. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ - as `query`, representing attention output. Or a tuple if \ - `need_weights` is True or `cache` is not None. If `need_weights` \ - is True, except for attention output, the tuple also includes \ - the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ - If `cache` is not None, the tuple then includes the new cache \ - having the same type as `cache`, and if it is `StaticCache`, it \ - is same as the input `cache`, if it is `Cache`, the new cache \ - reserves tensors concatanating raw tensors with intermediate \ - results of current query. + as `query`, representing attention output. """ - raise NotImplementedError() + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, query.dtype) + + assert cache == None, "Only support cache is None now." 
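+        # NOTE(editor): parameters for both the pre- and post-layer_norm
+        # variants are always forwarded to the fused op; `pre_layer_norm`
+        # (here `self.normalize_before`) selects which pair is applied.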
+ + out = incubate_f.fused_multi_head_attention( + x=query, + qkv_weight=self.qkv_weight, + linear_weight=self.linear_weight, + pre_layer_norm=self.normalize_before, + pre_ln_scale=self.pre_ln_scale, + pre_ln_bias=self.pre_ln_bias, + ln_scale=self.ln_scale, + ln_bias=self.ln_bias, + pre_ln_epsilon=1e-05, + qkv_bias=self.qkv_bias, + linear_bias=self.linear_bias, + attn_mask=attn_mask, + dropout_rate=self.dropout_rate, + attn_dropout_rate=self.attn_dropout_rate, + ln_epsilon=1e-05) + return out class FusedFeedForward(Layer): @@ -186,7 +254,8 @@ class FusedTransformerEncoderLayer(Layer): Examples: .. code-block:: python - + + # required: gpu import paddle from paddle.nn import TransformerEncoderLayer diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 8daae3d0ca90e..1af53e0826be8 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -61,7 +61,6 @@ from .conv import conv1d # noqa: F401 from .conv import conv1d_transpose # noqa: F401 from .common import linear # noqa: F401 -from .fused_transformer import fused_multi_head_attention # noqa: F401 from .conv import conv2d # noqa: F401 from .conv import conv2d_transpose # noqa: F401 from .conv import conv3d # noqa: F401 @@ -212,6 +211,5 @@ 'layer_norm', 'instance_norm', 'class_center_sample', - 'fused_multi_head_attention', 'sparse_attention', ] diff --git a/python/paddle/nn/functional/fused_transformer.py b/python/paddle/nn/functional/fused_transformer.py deleted file mode 100644 index 565ef223a96cb..0000000000000 --- a/python/paddle/nn/functional/fused_transformer.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from ...fluid.framework import in_dygraph_mode -from paddle import _C_ops - -__all__ = [] - - -def fused_multi_head_attention(x, - qkv_weight, - linear_weight, - pre_layer_norm=False, - pre_ln_scale=None, - pre_ln_bias=None, - ln_scale=None, - ln_bias=None, - pre_ln_epsilon=1e-05, - qkv_bias=None, - linear_bias=None, - attn_mask=None, - dropout_rate=0.5, - attn_dropout_rate=0.5, - ln_epsilon=1e-05, - name=None): - """ - Attention mapps queries and a set of key-value pairs to outputs, and - Multi-Head Attention performs multiple parallel attention to jointly attending - to information from different representation subspaces. This API only - support self_attention. The pseudo code is as follows: - if pre_layer_norm: - out = layer_norm(x); - out = linear(out) + qkv)bias - else: - out = linear(x) + bias; - out = transpose(out, perm=[2, 0, 3, 1, 4]); - # extract q, k and v from out. - q = out[0:1,::] - k = out[1:2,::] - v = out[2:3,::] - out = q * k^t; - out = attn_mask + out; - out = softmax(out); - out = dropout(out); - out = out * v; - out = transpose(out, perm=[0, 2, 1, 3]); - out = out_linear(out); - out = layer_norm(x + dropout(linear_bias + out)); - - Parameters: - x (Tensor): The input tensor of fused_multi_head_attention. 
The shape is - `[batch\_size, sequence\_len, embed\_dim]`. - qkv_weight (Tensor): The qkv weight tensor. The shape is `[3, num_head, dim_head, dim_embed]`. - linear_weight (Tensor): The linear weight tensor. The shape is `[embed_dim, embed_dim]`. - pre_layer_norm (bool, optional): whether it is pre_layer_norm or post_layer_norm architecture. - Default False. - pre_ln_scale (Tensor, optional): The weight tensor of pre layernorm. Default None. - pre_ln_bias (Tensor, optional): The bias tensor of pre layernorm. Default None. - ln_scale (Tensor, optional): The weight tensor of layernorm. Default None. - ln_bias (Tensor, optional): The bias tensor of layernorm. Default None. - pre_ln_epsilon (float, optional): Small float value added to denominator of the pre layer_norm - to avoid dividing by zero. Default is 1e-5. - qkv_bias (Tensor, optional): The bias of qkv computation. The shape is `[3, num_head, dim_head]`. - Default None. - linear_bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None. - attn_mask (Tensor, optional): - dropout_rate (float, optional): The dropout probability used on attention - weights to drop some attention targets for the dropout after attention. - 0 for no dropout. Default 0. - attn_dropout_rate (float, optional): The dropout probability used on attention - weights to drop some attention targets for the dropout in attention. - 0 for no dropout. Default 0. - ln_epsilon (float, optional): Small float value added to denominator of layer_norm - to avoid dividing by zero. Default is 1e-5. - - Examples: - - .. code-block:: python - - # required: gpu - import paddle - import paddle.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # qkv_weight: [3, num_head, dim_head, dim_embed] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - # qkv_bias: [3, num_head, dim_head] - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - # linear_weight: [embed_dim, embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - # linear_bias: [embed_dim] - linear_bias = paddle.rand(shape=[128], dtype="float32") - # self attention mask: [batch_size, num_heads, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_head_attention( - x, qkv_weight, linear_weight, False, - None, None, None, None, 1e-5, qkv_bias, - linear_bias, attn_mask) - # [2, 4, 128] - print(output.shape) - """ - if in_dygraph_mode(): - # pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out, - # qktv_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, attn_mask_out, fmha_out, - # linear_out, dropout_mask_out, ln_mean_out, ln_var_out, bias_dropout_residual_out, final_out - _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, final_out = _C_ops.fused_attention( - x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, attn_mask, - linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', - pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate', - dropout_rate, 'attn_dropout_rate', attn_dropout_rate, 'ln_epsilon', - ln_epsilon) - return final_out diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 89843885c8a12..9b765a1d7c782 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -# TODO: define normalization api +# TODO: define normalization api import paddle import paddle.fluid as fluid from ...fluid.data_feeder import check_variable_and_dtype, check_type @@ -35,7 +35,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): .. math:: y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) } - + .. math:: \lvert \lvert x \rvert \rvert_p = \left( \sum_i {\lvert x_i \rvert^p} \right)^{1/p} @@ -45,7 +45,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): Parameters: x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. p (float|int, optional): The exponent value in the norm formulation. Default: 2 - axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. + axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -123,13 +123,13 @@ def batch_norm(x, Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . nn.functional.batch_norm is uesd for nn.BatchNorm1D, nn.BatchNorm2D, nn.BatchNorm3D. Please use above API for BatchNorm. - + Parameters: x(Tesnor): input value. It's data type should be float32, float64. running_mean(Tensor): running mean. running_var(Tensor): running variance. weight(Tensor): The weight tensor of batch_norm, can not be None. - bias(Tensor): The bias tensor of batch_norm can not be None. + bias(Tensor): The bias tensor of batch_norm can not be None. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. @@ -252,7 +252,7 @@ def layer_norm(x, name=None): """ see more detail in paddle.nn.LayerNorm - + Parameters: x(Tensor): Input Tensor. It's data type should be float32, float64. 
normalized_shape(int|list|tuple): Input shape from an expected input of @@ -277,7 +277,7 @@ def layer_norm(x, np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) print(layer_norm_out) """ @@ -378,7 +378,7 @@ def instance_norm(x, np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) instance_norm_out = paddle.nn.functional.instance_norm(x) print(instance_norm_out) diff --git a/python/paddle/tensor/signal.py b/python/paddle/signal.py similarity index 97% rename from python/paddle/tensor/signal.py rename to python/paddle/signal.py index 86022a1748356..fc80c7cbc80f3 100644 --- a/python/paddle/tensor/signal.py +++ b/python/paddle/signal.py @@ -16,16 +16,14 @@ import paddle -from .attribute import is_complex, is_floating_point +from .tensor.attribute import is_complex, is_floating_point from .fft import fft_r2c, fft_c2r, fft_c2c -from ..fluid.data_feeder import check_variable_and_dtype -from ..fluid.framework import in_dygraph_mode -from ..fluid.layer_helper import LayerHelper -from .. import _C_ops +from .fluid.data_feeder import check_variable_and_dtype +from .fluid.framework import in_dygraph_mode +from .fluid.layer_helper import LayerHelper +from . import _C_ops __all__ = [ - 'frame', - 'overlap_add', 'stft', 'istft', ] @@ -56,7 +54,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): .. code-block:: python import paddle - from paddle.tensor.signal import frame + from paddle.signal import frame # 1D x = paddle.arange(8) @@ -177,7 +175,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): .. code-block:: python import paddle - from paddle.tensor.signal import overlap_add + from paddle.signal import overlap_add # 2D x0 = paddle.arange(16).reshape([8, 2]) @@ -291,11 +289,11 @@ def stft(x, real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( `onesided` is `False`) - Exampels: + Examples: .. code-block:: python import paddle - from paddle.tensor.signal import stft + from paddle.signal import stft # real-valued input x = paddle.randn([8, 48000], dtype=paddle.float64) @@ -415,7 +413,7 @@ def istft(x, - :math:`N`: Value of `n_fft`. - :math:`H`: Value of `hop_length`. - Result of `istft` expected to be the inverse of `paddle.tensor.signal.stft`, but it is + Result of `istft` expected to be the inverse of `paddle.signal.stft`, but it is not guaranteed to reconstruct a exactly realizible time-domain signal from a STFT complex tensor which has been modified (via masking or otherwise). Therefore, `istft` gives the [Griffin-Lim optimal estimate](https://ieeexplore.ieee.org/document/1164317) @@ -454,12 +452,12 @@ def istft(x, A tensor of least squares estimation of the reconstructed signal(s) with shape `[..., seq_length]` - Exampels: + Examples: .. 
code-block:: python import numpy as np import paddle - from paddle.tensor.signal import stft, istft + from paddle.signal import stft, istft paddle.seed(0) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b898b60fe4712..04d0a3c745f10 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -44,6 +44,7 @@ from .linalg import cholesky # noqa: F401 from .linalg import bmm # noqa: F401 from .linalg import histogram # noqa: F401 +from .linalg import bincount # noqa: F401 from .linalg import mv # noqa: F401 from .linalg import eig # noqa: F401 from .linalg import matrix_power # noqa: F401 @@ -220,8 +221,6 @@ from .array import create_array # noqa: F401 from .einsum import einsum # noqa: F401 -from . import fft -from . import signal #this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ #noqa @@ -236,6 +235,7 @@ 'cholesky', 'bmm', 'histogram', + 'bincount', 'mv', 'matrix_power', 'qr', diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py deleted file mode 100644 index 20fd143589fa4..0000000000000 --- a/python/paddle/tensor/fft.py +++ /dev/null @@ -1,1601 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Sequence -import numpy as np -import paddle -from .attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype -from ..fluid.framework import in_dygraph_mode -from .. import _C_ops -from ..fluid.data_feeder import check_variable_and_dtype -from ..fluid.layer_helper import LayerHelper - -__all__ = [] - - -def _check_normalization(norm): - if norm not in ['forward', 'backward', 'ortho']: - raise ValueError( - "Unexpected norm: {}. Norm should be forward, backward or ortho". - format(norm)) - - -def _check_fft_n(n): - if not isinstance(n, int): - raise ValueError( - "Invalid FFT argument n({}), it should be an integer.".format(n)) - if n <= 0: - raise ValueError( - "Invalid FFT argument n({}), it should be positive.".format(n)) - - -def _check_fft_shape(x, s): - ndim = x.ndim - if not isinstance(s, Sequence): - raise ValueError( - "Invalid FFT argument s({}), it should be a sequence of integers.".format(s)) - - if len(s) > ndim: - raise ValueError( - "Length of FFT argument s should not be larger than the rank of input.
" - "Received s: {}, rank of x: {}".format(s, ndim)) - for size in s: - if not isinstance(size, int) or size <= 0: - raise ValueError("FFT sizes {} contains invalid value ({})".format( - s, size)) - - -def _check_fft_axis(x, axis): - ndim = x.ndim - if not isinstance(axis, int): - raise ValueError( - "Invalid FFT axis ({}), it shoule be an integer.".format(axis)) - if axis < -ndim or axis >= ndim: - raise ValueError( - "Invalid FFT axis ({}), it should be in range [-{}, {})".format( - axis, ndim, ndim)) - - -def _check_fft_axes(x, axes): - ndim = x.ndim - if not isinstance(axes, Sequence): - raise ValueError( - "Invalid FFT axes ({}), it should be a sequence of integers.". - format(axes)) - if len(axes) > ndim: - raise ValueError( - "Length of fft axes should not be larger than the rank of input. " - "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) - for axis in axes: - if not isinstance(axis, int) or axis < -ndim or axis >= ndim: - raise ValueError( - "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})". - format(axes, axis, ndim, ndim)) - - -def _resize_fft_input(x, s, axes): - if len(s) != len(axes): - raise ValueError("length of `s` should equals length of `axes`.") - shape = x.shape - ndim = x.ndim - - axes_to_pad = [] - paddings = [] - axes_to_slice = [] - slices = [] - for i, axis in enumerate(axes): - if shape[axis] < s[i]: - axes_to_pad.append(axis) - paddings.append(s[i] - shape[axis]) - elif shape[axis] > s[i]: - axes_to_slice.append(axis) - slices.append((0, s[i])) - - if axes_to_slice: - x = paddle.slice( - x, - axes_to_slice, - starts=[item[0] for item in slices], - ends=[item[1] for item in slices]) - if axes_to_pad: - padding_widths = [0] * (2 * ndim) - for axis, pad in zip(axes_to_pad, paddings): - padding_widths[2 * axis + 1] = pad - x = paddle.nn.functional.pad(x, padding_widths) - return x - - -def _normalize_axes(x, axes): - ndim = x.ndim - return [item if item >= 0 else (item + ndim) for item in axes] - - -def _check_at_least_ndim(x, rank): - if x.ndim < rank: - raise ValueError("The rank of the input ({}) should >= {}".format( - x.ndim, rank)) - - -# public APIs 1d -def fft(x, n=None, axis=-1, norm="backward", name=None): - """ - Calculate one-dimensional discrete Fourier transform. - - This function uses the efficient fast Fourier transform (FFT) algorithm [1] to - calculate the 1-D * n * point discrete Fourier transform (DFT). - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified - by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. 
The truncated or zero-padded input, transformed along the axis indicated - by `axis`, or the last one if `axis` is not specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.exp(3j * np.pi * np.arange(7) / 7) - xp = paddle.to_tensor(x) - fft_xp = paddle.fft.fft(xp).numpy() - print(fft_xp) - # [1.+1.25396034e+00j 1.+4.38128627e+00j 1.-4.38128627e+00j - # 1.-1.25396034e+00j 1.-4.81574619e-01j 1.+8.88178420e-16j - # 1.+4.81574619e-01j] - - - """ - if is_interger(x) or is_floating_point(x): - return fft_r2c( - x, n, axis, norm, forward=True, onesided=False, name=name) - else: - return fft_c2c(x, n, axis, norm, forward=True, name=name) - - -def ifft(x, n=None, axis=-1, norm="backward", name=None): - """ - Compute the 1-D inverse discrete Fourier Transform. - - This function computes the inverse of the 1-D *n*-point discrete Fourier transform - computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. - - The input should be ordered in the same way as is returned by `fft`, - i.e., - - * ``x[0]`` should contain the zero frequency term, - * ``x[1:n//2]`` should contain the positive-frequency terms, - * ``x[n//2 + 1:]`` should contain the negative-frequency terms, in - increasing order starting from the most negative frequency. - - For an even number of input points, ``x[n//2]`` represents the sum of - the values at the positive and negative Nyquist frequencies, as the two - are aliased together. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified - by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated - by `axis`, or the last one if `axis` is not specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.exp(3j * np.pi * np.arange(7) / 7) - xp = paddle.to_tensor(x) - ifft_xp = paddle.fft.ifft(xp).numpy() - print(ifft_xp) - # [0.14285714+1.79137191e-01j 0.14285714+6.87963741e-02j - # 0.14285714+1.26882631e-16j 0.14285714-6.87963741e-02j - # 0.14285714-1.79137191e-01j 0.14285714-6.25898038e-01j - # 0.14285714+6.25898038e-01j] - - """ - if is_interger(x) or is_floating_point(x): - return fft_r2c( - x, n, axis, norm, forward=False, onesided=False, name=name) - else: - return fft_c2c(x, n, axis, norm, forward=False, name=name) - - -def rfft(x, n=None, axis=-1, norm="backward", name=None): - """ - The one dimensional FFT for real input. 
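A hedged sketch of this real-input transform and its inverse, using the public `paddle.fft` APIs (illustrative only, not lines from this patch; `irfft` is documented further below):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0])
    spec = paddle.fft.rfft(x)            # one-sided spectrum, length 4//2 + 1 = 3
    print(spec.numpy())                  # [ (1+0j), -1j, (-1+0j) ]
    x_rec = paddle.fft.irfft(spec, n=4)  # inverse recovers the real signal
    print(paddle.allclose(x, x_rec))     # True, up to floating-point error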
- - This function computes the one dimensional *n*-point discrete Fourier - Transform (DFT) of a real-valued tensor by means of an efficient algorithm - called the Fast Fourier Transform (FFT). - - When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore - ``n//2 + 1``. - - Args: - x(Tensor) : Real-valued input tensor - n(int, optional): Number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis - specified by `axis` is used. - axis(int, optional): Axis over which to compute the FFT. Default value - is last axis. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor - - Raises: - - - Examples: - .. code-block:: python - import paddle - - x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) - print(paddle.fft.rfft(x)) - # Tensor(shape=[3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [ (1+0j), -1j , (-1+0j)]) - """ - return fft_r2c(x, n, axis, norm, forward=True, onesided=True, name=name) - - -def irfft(x, n=None, axis=-1, norm="backward", name=None): - """ - Computes the inverse of `rfft`. - - This function calculates the inverse of the one-dimensional *n* point discrete - Fourier transform of the actual input calculated by "rfft". In other words, - ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. - - The input shall be in the form of "rfft", i.e. the actual zero frequency term, - followed by the complex positive frequency term, in the order of increasing frequency. - Because the discrete Fourier transform of the actual input is Hermite symmetric, - the negative frequency term is regarded as the complex conjugate term of the corresponding - positive frequency term. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified - along the ` axis'. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. 
The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` - in some cases. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - irfft_xp = paddle.fft.irfft(xp).numpy() - print(irfft_xp) - # [0. 1. 0. 0.] - - """ - return fft_c2r(x, n, axis, norm, forward=False, name=name) - - -def hfft(x, n=None, axis=-1, norm="backward", name=None): - """ - Compute the FFT of a signal that has Hermitian symmetry, a real - spectrum. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified - along the ` axis'. - axis (int,optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in - some cases. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - hfft_xp = paddle.fft.hfft(xp).numpy() - print(hfft_xp) - # [0. 0. 0. 4.] - """ - - return fft_c2r(x, n, axis, norm, forward=True, name=name) - - -def ihfft(x, n=None, axis=-1, norm="backward", name=None): - """ - The inverse FFT of a signal that has Hermitian symmetry. - - This function computes the one dimensional *n*-point inverse FFT of a signal - that has Hermitian symmetry by means of an efficient algorithm called - the Fast Fourier Transform (FFT). - - When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore - ``n//2 + 1``. - - Args: - x(Tensor): Input tensor. - n(int, optional): The number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis - specified by `axis` is used. - axis(int, optional) : Axis over which to compute the inverse FFT. If not - given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". 
- name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor. - - Examples: - .. code-block:: python - import paddle - - spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) - print(paddle.fft.ifft(spectrum)) - # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) - print(paddle.fft.ihfft(spectrum)) - # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) - - """ - return fft_r2c(x, n, axis, norm, forward=False, onesided=True, name=name) - - -# public APIs nd -def fftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D discrete Fourier Transform. - - This function calculates the n-D discrete Fourier transform on any number of axes - in the M-D array by fast Fourier transform (FFT). - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - This corresponds to ``n`` for ``fft(x, n)``. - Along any axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. - axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by - `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:4, :4, :4][1] - xp = paddle.to_tensor(x) - fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() - print(fftn_xp) - # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] - """ - if is_interger(x) or is_floating_point(x): - return fftn_r2c( - x, s, axes, norm, forward=True, onesided=False, name=name) - else: - return fftn_c2c(x, s, axes, norm, forward=True, name=name) - - -def ifftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D inverse discrete Fourier Transform. - - This function computes the inverse of the N-D discrete - Fourier Transform over any number of axes in an M-D array by - means of the Fast Fourier Transform (FFT). In other words, - ``ifftn(fftn(x)) == x`` to within numerical accuracy. - - The input, analogously to `ifft`, should be ordered in the same way as is - returned by `fftn`, i.e., it should have the term for zero frequency - in all axes in the low-order corner, the positive frequency terms in the - first half of all axes, the term for the Nyquist frequency in the middle - of all axes and the negative frequency terms in the second half of all - axes, in order of decreasingly negative frequency. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - This corresponds to ``n`` for ``fft(x, n)``. - Along any axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. - axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by - `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - - x = np.eye(3) - xp = paddle.to_tensor(x) - ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy() - print(ifftn_xp) - - # [[ 0.33333333+0.j 0.33333333+0.j 0.33333333-0.j ] - # [ 0.33333333+0.j -0.16666667+0.28867513j -0.16666667-0.28867513j] - # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] - - """ - if is_interger(x) or is_floating_point(x): - return fftn_r2c( - x, s, axes, norm, forward=False, onesided=False, name=name) - else: - return fftn_c2c(x, s, axes, norm, forward=False, name=name) - - -def rfftn(x, s=None, axes=None, norm="backward", name=None): - """ - The N dimensional FFT for real input. - - This function computes the N-dimensional discrete Fourier Transform over - any number of axes in an M-dimensional real array by means of the Fast - Fourier Transform (FFT). By default, all axes are transformed, with the - real transform performed over the last axis, while the remaining - transforms are complex. - - The transform for real input is performed over the last transformation - axis, as by `rfft`, then the transform over the remaining axes is - performed as by `fftn`. The order of the output is as for `rfft` for the - final transformation axis, and as for `fftn` for the remaining - transformation axes. - - Args: - x(Tensor) : Input tensor, taken to be real. - s(Sequence[int]) : Shape to use from the exec fft. The final element of - `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining - axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if - the given shape is smaller than that of the input, the input is - cropped. If it is larger, the input is padded with zeros. if `s` is - not given, the shape of the input along the axes specified by `axes` - is used. - axes(Sequence[int]) : Axes over which to compute the FFT. If not given, - the last ``len(s)`` axes are used, or all axes if `s` is also not - specified. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor): complex tensor - - - Raises: - ValueError: If `s` and `axes` have different length. - - Examples: - .. code-block:: python - import paddle - - # default, all axis will be used to exec fft - x = paddle.ones((2, 3, 4)) - print(paddle.fft.rfftn(x)) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - # use axes(2, 0) - print(paddle.fft.rfftn(x, axes=(2, 0))) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - """ - return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) - - -def irfftn(x, s=None, axes=None, norm="backward", name=None): - """ - Computes the inverse of `rfftn`. - - This function computes the inverse of the N-D discrete - Fourier Transform for real input over any number of axes in an - M-D array by means of the Fast Fourier Transform (FFT). 
In - other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical - accuracy. (The ``x.shape`` is necessary like ``len(x)`` is for `irfft`, - and for the same reason.) - - The input should be ordered in the same way as is returned by `rfftn`, - i.e., as for `irfft` for the final transformation axis, and as for `ifftn` - along all the other axes. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the - number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where - ``k`` is the length of the input along that axis. - axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or by a combination of `s` or `x`, as explained in the parameters section above. The length of - each transformed axis is as given by the corresponding element of `s`, or the length of the input - in every axis except for the last one if `s` is not given. In the final transformed axis the length - of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final - transformed axis of the input. To get an odd number of output points in the final axis, - `s` must be specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfftn_xp = paddle.fft.irfftn(xp).numpy() - print(irfftn_xp) - # [ 2.25 -1.25 0.25 0.75] - - """ - return fftn_c2r(x, s, axes, norm, forward=False, name=name) - - -def hfftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D FFT of Hermitian symmetric complex input, i.e., a - signal with a real spectrum. - - This function calculates the n-D discrete Fourier transform of Hermitian-symmetric - complex input over any axes in an M-D array by fast Fourier transform (FFT). - In other words, ``ihfftn(hfftn(x, s)) == x`` is within the numerical accuracy range. - (``s`` here is ``x.shape`` with ``s[-1] = x.shape[-1] * 2 - 1``. This is necessary - for the same reason that ``irfft`` requires ``x.shape``.) - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the - number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used.
Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where - ``k`` is the length of the input along that axis. - axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or - a combination of `s` or `X`. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfftn_xp = paddle.fft.hfftn(xp).numpy() - print(hfftn_xp) - # [ 9. 3. 1. -5.] - - - """ - return fftn_c2r(x, s, axes, norm, forward=True, name=name) - - -def ihfftn(x, s=None, axes=None, norm="backward", name=None): - """ - The n dimensional inverse FFT of a signal that has Hermitian symmetry. - - This function computes the n dimensional inverse FFT over any number of axes - in an M-dimensional of a signal that has Hermitian symmetry by means of an - efficient algorithm called the Fast Fourier Transform (FFT). - - Args: - x(Tensor): Input tensor. - s(Sequence[int], optional) : Shape (length along each transformed axis) - to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis - 1, etc.). Along any axis, if the given shape is smaller than that - of the input, the input is cropped. If it is larger, the input is - padded with zeros. if `s` is not given, the shape of the input - along the axes specified by `axes` is used. - axis(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not - given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor. - - Examples: - .. 
code-block:: python - import paddle - - spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) - print(paddle.fft.ifft(spectrum)) - # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) - print(paddle.fft.ihfft(spectrum)) - # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) - - """ - return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name) - - -# public APIs 2d -def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D discrete Fourier Transform - - This function computes the N-D discrete Fourier Transform - over any axes in an M-D array by means of the - Fast Fourier Transform (FFT). By default, the transform is computed over - the last two axes of the input array, i.e., a 2-dimensional FFT. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. - Along each axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or the last two axes if `axes` is not given. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - fft2_xp = paddle.fft.fft2(xp).numpy() - print(fft2_xp) - # [[ 2.+0.j -2.+0.j] - # [ 0.+0.j 0.+0.j]] - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return fftn(x, s, axes, norm, name) - - -def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D inverse discrete Fourier Transform. - - This function computes the inverse of the 2-D discrete Fourier - Transform over any number of axes in an M-D array by means of - the Fast Fourier Transform (FFT). 
In other words, ``ifft2(fft2(x)) == x`` - to within numerical accuracy. By default, the inverse transform is - computed over the last two axes of the input array. - - The input, analogously to `ifft`, should be ordered in the same way as is - returned by `fft2`, i.e., it should have the term for zero frequency - in the low-order corner of the two axes, the positive frequency terms in - the first half of these axes, the term for the Nyquist frequency in the - middle of the axes and the negative frequency terms in the second half of - both axes, in order of decreasingly negative frequency. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. - Along each axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or the last two axes if `axes` is not given. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - ifft2_xp = paddle.fft.ifft2(xp).numpy() - print(ifft2_xp) - # [[ 0.5+0.j -0.5+0.j] - # [ 0. +0.j 0. +0.j]] - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return ifftn(x, s, axes, norm, name) - - -def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - The two dimensional FFT with real tensor input. - - This is really just `rfftn` with different default behavior. - For more details see `rfftn`. - - Args: - x(Tensor): Input tensor, taken to be real. - s(Sequence[int]) : Shape of the FFT. - axes(Sequence[int], optional): Axes over which to compute the FFT. - norm(str, optional) : {"backward", "ortho", "forward"}, - default is "backward". Indicates which direction of the - forward/backward pair of transforms is scaled and with what - normalization factor. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor): The result of the real 2-D FFT. - - Raises: - - - Examples: - - .. 
code-block:: python - import paddle - import numpy as np - - x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32)) - print(paddle.fft.rfft2(x)) - # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[ (50+0j) , (1.1920928955078125e-07+0j) , 0j ], - # [(-12.5+17.204774856567383j) , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j ], - # [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j ], - # [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j ], - # [(-12.5-17.204774856567383j) , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j ]]) - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return rfftn(x, s, axes, norm, name) - - -def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Computes the inverse of `rfft2`. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. - axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes - must be two-dimensional. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. The result of the inverse real 2-D FFT. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfft2_xp = paddle.fft.irfft2(xp).numpy() - print(irfft2_xp) - # [[ 2.375 -1.125 0.375 0.875] - # [ 0.125 0.125 0.125 0.125]] - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return irfftn(x, s, axes, norm, name) - - -def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D FFT of a Hermitian complex array. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape of the real output. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be - two-dimensional. If not specified, the last two axes are used by default. 
- norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. The real result of the 2-D Hermitian complex real FFT. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfft2_xp = paddle.fft.hfft2(xp).numpy() - print(hfft2_xp) - # [[19. 7. 3. -9.] - # [ 1. 1. 1. 1.]] - - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return hfftn(x, s, axes, norm, name) - - -def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the two dimensional inverse FFT of a real spectrum. - - This is really `ihfftn` with different defaults. - For more details see `ihfftn`. - - Args: - x(Tensor): Input tensor - s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the - inverse fft. Default is the last two axes. - norm(str, optional): {"backward", "ortho", "forward"}. Default is - "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : The result of the inverse hermitian 2-D FFT. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:5, :5][0].astype(np.float64) - xp = paddle.to_tensor(x) - ihfft2_xp = paddle.fft.ihfft2(xp).numpy() - print(ihfft2_xp) - # [[ 2. +0.j 0. +0.j 0. +0.j ] - # [-0.5-0.68819096j 0. +0.j 0. +0.j ] - # [-0.5-0.16245985j 0. +0.j 0. +0.j ] - # [-0.5+0.16245985j 0. +0.j 0. +0.j ] - # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return ihfftn(x, s, axes, norm, name) - - -# public APIs utilities -def fftfreq(n, d=1.0, dtype=None, name=None): - """ - Return the Discrete Fourier Transform sample frequencies. - - The returned float array `f` contains the frequency bin centers in cycles - per unit of the sample spacing (with zero at the start). For instance, if - the sample spacing is in seconds, then the frequency unit is cycles/second. 
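A small numeric sketch of the bin ordering given by the formula on the lines that follow (a hedged illustration using the public `paddle.fft.fftfreq` API, not part of the patch):

.. code-block:: python

    import paddle

    # n = 5 samples spaced d = 0.5 apart: bins are k / (d * n) = k / 2.5,
    # ordered [0, 1, 2, -2, -1] / 2.5
    freqs = paddle.fft.fftfreq(5, d=0.5)
    print(freqs.numpy())  # [ 0.   0.4  0.8 -0.8 -0.4]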
- - Given input length `n` and a sample spacing `d`:: - - f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even - f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd - - Args: - n (int): Window length (the number of samples). - d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. A tensor of length `n` containing the sample frequencies. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.5 - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) - print(fftfreq_xp) - - # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [ 0. , 0.40000001, 0.80000001, -0.80000001, -0.40000001]) - """ - - dtype = paddle.framework.get_default_dtype() - val = 1.0 / (n * d) - pos_max = (n + 1) // 2 - neg_max = n // 2 - indices = paddle.arange(-neg_max, pos_max, dtype=dtype, name=name) - indices = paddle.roll(indices, -neg_max, name=name) - return indices * val - - -def rfftfreq(n, d=1.0, dtype=None, name=None): - """ - Return the Discrete Fourier Transform sample frequencies. - - The returned floating-point tensor contains the frequency bin centers in cycles - per unit of the sample spacing (with zero at the start). - - Given input length `n` and a sample spacing `d`:: - - f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even - f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd - - the Nyquist frequency component is considered to be positive. - - Args: - n (int): Window length (the number of samples). - d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. A tensor of length ``n//2 + 1`` containing the sample frequencies. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 - n = x.size - rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp) - print(rfftfreq_xp) - - # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [0. , 0.66666669, 1.33333337]) - - """ - - dtype = paddle.framework.get_default_dtype() - val = 1.0 / (n * d) - pos_max = 1 + n // 2 - indices = paddle.arange(0, pos_max, dtype=dtype, name=name) - return indices * val - - -def fftshift(x, axes=None, name=None): - """ - Shift the zero-frequency component to the center of the spectrum. - - This function swaps half spaces for all the axes listed (all by default). - Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. - - Args: - x (Tensor): The input tensor. - axes (int|tuple, optional): The axes over which to shift. Default is None, which shifts all axes. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. The shifted tensor. - - Examples: - - ..
code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.fftshift(fftfreq_xp).numpy() - print(res) - # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] - - """ - shape = paddle.shape(x) - if axes is None: - # shift all axes - rank = paddle.rank(x).reshape([1]) - axes = axes or paddle.arange(0, rank) - shifts = [size // 2 for size in shape] - elif isinstance(axes, int): - shifts = shape[axes] // 2 - else: - shifts = [shape[ax] // 2 for ax in axes] - return paddle.roll(x, shifts, axes, name=name) - - -def ifftshift(x, axes=None, name=None): - """ - The inverse of `fftshift`. The two are identical for even-length `x`; for - odd-length `x` the shifts differ by one sample. - - Args: - x (Tensor): The input tensor. - axes (int|tuple, optional): The axes over which to shift. Default is None, which shifts all axes. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. The shifted tensor. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.ifftshift(fftfreq_xp).numpy() - print(res) - # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] - - """ - shape = paddle.shape(x) - if axes is None: - # shift all axes - rank = paddle.rank(x).reshape([1]) - axes = axes or paddle.arange(0, rank) - shifts = [-size // 2 for size in shape] - elif isinstance(axes, int): - shifts = -shape[axes] // 2 - else: - shifts = [-shape[ax] // 2 for ax in axes] - return paddle.roll(x, shifts, axes, name=name) - - -# internal functions -def fft_c2c(x, n, axis, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_c2c' - - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fft_r2c(x, n, axis, norm, forward, onesided, name): - if is_interger(x): - x = paddle.cast(x, paddle.get_default_dtype()) - _check_normalization(norm) - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_r2c' - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) - out = getattr(_C_ops, op_type)(x, *attrs) - else:
- inputs = {'X': [x], } - attrs = { - 'axes': axes, - 'normalization': norm, - 'forward': forward, - 'onesided': onesided, - } - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fft_c2r(x, n, axis, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n // 2 + 1] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_c2r' - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - - if in_dygraph_mode(): - if n is not None: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', n) - else: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - if n is not None: - attrs['last_dim_size'] = n - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fftn_c2c(x, s, axes, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - if s is not None: - _check_fft_shape(x, s) - - rank = x.ndim - if axes is None: - if s is None: - axes = list(range(rank)) - else: - fft_ndims = len(s) - axes = list(range(rank - fft_ndims, rank)) - else: - _check_fft_axes(x, axes) - axes = _normalize_axes(x, axes) - axes_argsoft = np.argsort(axes).tolist() - axes = [axes[i] for i in axes_argsoft] - if s is not None: - if len(s) != len(axes): - raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". 
-                    format(len(s), len(axes)))
-            s = [s[i] for i in axes_argsort]
-
-    if s is not None:
-        x = _resize_fft_input(x, s, axes)
-    op_type = 'fft_c2c'
-    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type)
-
-    if in_dygraph_mode():
-        attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
-        out = getattr(_C_ops, op_type)(x, *attrs)
-    else:
-        inputs = {'X': [x], }
-        attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
-        helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype(input_param_name='x')
-        out = helper.create_variable_for_type_inference(dtype)
-        outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
-    return out
-
-
-def fftn_r2c(x, s, axes, norm, forward, onesided, name):
-    if is_interger(x):
-        x = paddle.cast(x, paddle.get_default_dtype())
-    _check_normalization(norm)
-    if s is not None:
-        _check_fft_shape(x, s)
-
-    rank = x.ndim
-    if axes is None:
-        if s is None:
-            axes = list(range(rank))
-        else:
-            fft_ndims = len(s)
-            axes = list(range(rank - fft_ndims, rank))
-    else:
-        _check_fft_axes(x, axes)
-        axes = _normalize_axes(x, axes)
-        # keep the last (one-sided) axis in place; sort only the leading axes
-        axes_argsort = np.argsort(axes[:-1]).tolist()
-        axes = [axes[i] for i in axes_argsort] + [axes[-1]]
-        if s is not None:
-            if len(s) != len(axes):
-                raise ValueError(
-                    "Length of s ({}) and length of axes ({}) do not match.".
-                    format(len(s), len(axes)))
-            s = [s[i] for i in axes_argsort] + [s[-1]]
-
-    if s is not None:
-        x = _resize_fft_input(x, s, axes)
-
-    op_type = 'fft_r2c'
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type)
-
-    if in_dygraph_mode():
-        attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
-                 'onesided', onesided)
-        out = getattr(_C_ops, op_type)(x, *attrs)
-    else:
-        inputs = {'X': [x], }
-        attrs = {
-            'axes': axes,
-            'normalization': norm,
-            'forward': forward,
-            'onesided': onesided,
-        }
-        helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype(input_param_name='x')
-        out = helper.create_variable_for_type_inference(
-            _real_to_complex_dtype(dtype))
-        outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
-
-    return out
-
-
-def fftn_c2r(x, s, axes, norm, forward, name):
-    if is_interger(x):
-        x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype()))
-    elif is_floating_point(x):
-        x = paddle.cast(x, _real_to_complex_dtype(x.dtype))
-    _check_normalization(norm)
-    if s is not None:
-        _check_fft_shape(x, s)
-
-    rank = x.ndim
-    if axes is None:
-        if s is None:
-            axes = list(range(rank))
-        else:
-            fft_ndims = len(s)
-            axes = list(range(rank - fft_ndims, rank))
-    else:
-        _check_fft_axes(x, axes)
-        axes = _normalize_axes(x, axes)
-        axes_argsort = np.argsort(axes[:-1]).tolist()
-        axes = [axes[i] for i in axes_argsort] + [axes[-1]]
-        if s is not None:
-            if len(s) != len(axes):
-                raise ValueError(
-                    "Length of s ({}) and length of axes ({}) do not match.".
-                    format(len(s), len(axes)))
-            s = [s[i] for i in axes_argsort] + [s[-1]]
-
-    if s is not None:
-        fft_input_shape = list(s)
-        fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1
-        x = _resize_fft_input(x, fft_input_shape, axes)
-
-    op_type = 'fft_c2r'
-    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type)
-
-    if in_dygraph_mode():
-        if s:
-            attrs = ('axes', axes, 'normalization', norm, 'forward', forward,
-                     'last_dim_size', s[-1])
-        else:
-            attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
-        out = getattr(_C_ops, op_type)(x, *attrs)
-    else:
-        inputs = {'X': [x], }
-        attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
-        if s:
-            attrs["last_dim_size"] = s[-1]
-        helper = LayerHelper(op_type, **locals())
-        dtype = helper.input_dtype(input_param_name='x')
-        out = helper.create_variable_for_type_inference(
-            _complex_to_real_dtype(dtype))
-        outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
-    return out
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 6853d904adbf6..aea56432fa9ca 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1293,6 +1293,59 @@ def histogram(input, bins=100, min=0, max=0, name=None):
     return out
 
 
+def bincount(x, weights=None, minlength=0, name=None):
+    """
+    Computes the frequency of each value in the input tensor.
+
+    Args:
+        x (Tensor): A 1-D Tensor of non-negative integers.
+        weights (Tensor, optional): Weight for each value in the input tensor. Should have the same shape as the input. Default is None.
+        minlength (int, optional): The minimum number of bins. Should be a non-negative integer. Default is 0.
+        name (str, optional): The default value is None. Normally there is no need for the user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The tensor of frequencies.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.to_tensor([1, 2, 1, 4, 5])
+            result1 = paddle.bincount(x)
+            print(result1) # [0, 2, 1, 0, 1, 1]
+
+            w = paddle.to_tensor([2.1, 0.4, 0.1, 0.5, 0.5])
+            result2 = paddle.bincount(x, weights=w)
+            print(result2) # [0., 2.19999981, 0.40000001, 0., 0.50000000, 0.50000000]
+    """
+    if x.dtype not in [paddle.int32, paddle.int64]:
+        raise TypeError("Elements in Input(x) should all be integers")
+
+    if in_dygraph_mode():
+        return _C_ops.bincount(x, weights, "minlength", minlength)
+
+    helper = LayerHelper('bincount', **locals())
+
+    check_variable_and_dtype(x, 'X', ['int32', 'int64'], 'bincount')
+
+    if weights is not None:
+        check_variable_and_dtype(weights, 'Weights',
+                                 ['int32', 'int64', 'float32', 'float64'],
+                                 'bincount')
+        out = helper.create_variable_for_type_inference(dtype=weights.dtype)
+    else:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='bincount',
+        inputs={'X': x,
+                'Weights': weights},
+        outputs={'Out': out},
+        attrs={'minlength': minlength})
+    return out
+
+
 def mv(x, vec, name=None):
     """
     Performs a matrix-vector product of the matrix x and the vector vec.
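
The `fftshift`/`ifftshift` pair above reduces to a plain `paddle.roll`, so the only real subtlety is the shift amount per axis. A minimal NumPy cross-check (illustration only, not part of the patch) of why `-(size // 2)`, rather than `(-size) // 2`, is the correct inverse shift:

    import numpy as np

    x = np.fft.fftfreq(5, d=0.3)  # odd length: [0., 0.667, 1.333, -1.333, -0.667]

    # fftshift rolls each axis forward by size // 2 ...
    shifted = np.roll(x, 5 // 2)
    assert np.allclose(shifted, np.fft.fftshift(x))

    # ... and ifftshift undoes it by rolling back by the same amount.
    restored = np.roll(shifted, -(5 // 2))
    assert np.allclose(restored, x)

    # (-5) // 2 == -3, but -(5 // 2) == -2: for odd sizes the two shifts
    # differ by one sample, so the parenthesization decides whether
    # ifftshift is the exact inverse of fftshift.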
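
The `n // 2 + 1` resize in `fft_c2r` (and the halved last dimension in `fftn_c2r`) comes from Hermitian symmetry: a real signal of length n has only n // 2 + 1 independent one-sided spectrum coefficients, and `last_dim_size` tells the op which original length to restore. A NumPy sketch of the same round trip (reference only):

    import numpy as np

    n = 10
    x = np.random.rand(n)
    half = np.fft.rfft(x)        # r2c: one-sided spectrum, length n // 2 + 1
    y = np.fft.irfft(half, n=n)  # c2r: n plays the role of last_dim_size

    assert half.shape[0] == n // 2 + 1
    assert np.allclose(y, x)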
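
The expected outputs in the `bincount` docstring can be cross-checked against `np.bincount`, whose semantics the new operator follows (sketch only, not part of the patch):

    import numpy as np

    x = np.array([1, 2, 1, 4, 5])
    w = np.array([2.1, 0.4, 0.1, 0.5, 0.5])

    print(np.bincount(x))               # [0 2 1 0 1 1]
    print(np.bincount(x, weights=w))    # [0.  2.2 0.4 0.  0.5 0.5]
    print(np.bincount(x, minlength=8))  # zero-padded to length 8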
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 5f7588cb2a9a0..9b9b2d9431eeb 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -696,15 +696,24 @@ def roll(x, shifts, axis=None, name=None):
 
     helper = LayerHelper("roll", **locals())
     check_type(axis, 'axis', (list, tuple), 'roll')
-    check_type(shifts, 'shifts', (list, tuple), 'roll')
+
     out = helper.create_variable_for_type_inference(x.dtype)
 
-    helper.append_op(
-        type='roll',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'axis': axis,
-               'shifts': shifts})
+    if isinstance(shifts, Variable):
+        helper.append_op(
+            type='roll',
+            inputs={'X': x,
+                    "ShiftsTensor": shifts},
+            outputs={'Out': out},
+            attrs={'axis': axis})
+    else:
+        check_type(shifts, 'shifts', (list, tuple), 'roll')
+        helper.append_op(
+            type='roll',
+            inputs={'X': x},
+            outputs={'Out': out},
+            attrs={'axis': axis,
+                   'shifts': shifts})
     return out
diff --git a/tools/ci_model_benchmark.sh b/tools/ci_model_benchmark.sh
new file mode 100644
index 0000000000000..574169869376a
--- /dev/null
+++ b/tools/ci_model_benchmark.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+function check_whl {
+    pip uninstall -y paddlepaddle_gpu
+    pip install build/pr_whl/*.whl
+    [ $? -ne 0 ] && echo "install paddle failed." && exit 1
+
+    unzip -q build/pr_whl/*.whl -d /tmp/pr
+    unzip -q build/dev_whl/*.whl -d /tmp/develop
+
+    sed -i '/version.py/d' /tmp/pr/*/RECORD
+    sed -i '/version.py/d' /tmp/develop/*/RECORD
+    # check diff's own exit status (2 means trouble) before counting lines;
+    # a piped `wc -l` would mask diff failures
+    diff /tmp/pr/*/RECORD /tmp/develop/*/RECORD > /tmp/whl_record.diff
+    [ $? -gt 1 ] && echo "diff paddle whl failed." && exit 1
+    diff_whl=$(wc -l < /tmp/whl_record.diff)
+    if [ ${diff_whl} -eq 0 ];then
+        echo "paddle whl has no diff in PR-CI-Model-benchmark, so skip this CI"
+        echo "ipipe_log_param_isSkipTest_model_benchmark: 1"
+        echo "cpu_benchmark=ON" >${cfs_dir}/model_benchmark/${AGILE_PULL_ID}/${AGILE_REVISION}/pass.txt
+        exit 0
+    else
+        echo "ipipe_log_param_isSkipTest_model_benchmark: 0"
+    fi
+}
+
+
+function compile_install_paddle {
+    export CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto}
+    export PY_VERSION=3.7
+    export WITH_DISTRIBUTE=ON
+    export WITH_GPU=ON
+    export WITH_TENSORRT=OFF
+    export WITH_TESTING=OFF
+    export WITH_UNITY_BUILD=ON
+    check_whl
+    cd /workspace/Paddle
+    git clone --depth=1 https://github.com/paddlepaddle/benchmark.git
+    cd benchmark
+    set +x
+    wget -q --no-proxy https://xly-devops.bj.bcebos.com/benchmark/new_clone/benchmark/benchmark_allgit.tar.gz
+    tar xf benchmark_allgit.tar.gz
+    set -x
+}
+
+function init_benchmark {
+    cd /workspace/Paddle/benchmark
+    git clone PaddleClas.bundle PaddleClas
+
+}
+
+function prepare_data {
+    cd ${cache_dir}
+    if [ -d "benchmark_data" ];then
+        echo -e "benchmark_data exists!"
+    else
+        mkdir benchmark_data && cd benchmark_data
+        mkdir dataset && cd dataset
+        wget --no-proxy -q https://paddle-qa.bj.bcebos.com/benchmark_data/Bert.zip
+        unzip Bert.zip
+        wget --no-proxy -q https://paddle-qa.bj.bcebos.com/benchmark_data/imagenet100_data.zip
+        unzip imagenet100_data.zip
+    fi
+}
+
+function run_model_benchmark {
+    cd /workspace/Paddle
+    pip install build/pr_whl/*.whl
+    cd ${cache_dir}/benchmark_data
+    export data_path=${cfs_dir}/model_dataset/model_benchmark_data
+    export prepare_path=${cfs_dir}/model_dataset/model_benchmark_prepare
+    export BENCHMARK_ROOT=/workspace/Paddle/benchmark
+    cd ${BENCHMARK_ROOT}/scripts/benchmark_ci
+    bash model_ci.sh
+}
+
+case $1 in
+    whl_check)
+        compile_install_paddle
+        ;;
+    run_benchmark)
+        init_benchmark
+        prepare_data
+        run_model_benchmark
+        ;;
+    run_all)
+        compile_install_paddle
+        prepare_data
+        run_model_benchmark
+        ;;
+esac
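
For reference, a sketch of what the `roll` change in python/paddle/tensor/manipulation.py enables: `shifts` may now be a Tensor computed at runtime, routed through the new `ShiftsTensor` input instead of failing the list/tuple `check_type`. The hunk only shows the static-graph branch, so the dygraph behavior below is an assumption:

    import paddle

    x = paddle.to_tensor([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0],
                          [7.0, 8.0, 9.0]])

    # shifts as a Tensor rather than an int/list/tuple
    shifts = paddle.to_tensor([1], dtype='int64')
    out = paddle.roll(x, shifts=shifts, axis=0)
    print(out.numpy())
    # [[7. 8. 9.]
    #  [1. 2. 3.]
    #  [4. 5. 6.]]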