diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dc6febdfaaa5..ef5d415212eeb 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,7 +257,7 @@ option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) -option(WITH_HETERPS "Compile with heterps" OFF}) +option(WITH_HETERPS "Compile with heterps" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9c1d71914bc21..82c4ec14d9ef8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -7,28 +7,33 @@ if(WITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") set(paddle_known_gpu_archs10 "53 62 72") set(paddle_known_gpu_archs11 "53 62 72 87") + set(paddle_known_gpu_archs12 "53 62 72 87 90") elseif(NEW_RELEASE_ALL) message("Using New Release Strategy - All Arches Packge") add_definitions(-DNEW_RELEASE_ALL) - set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") - set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") + set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90") + set(paddle_known_gpu_archs10 "50 52 60 61 70 75") set(paddle_known_gpu_archs11 "50 60 61 70 75 80") + set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90") elseif(NEW_RELEASE_PYPI) message("Using New Release Strategy - Cubin Packge") add_definitions(-DNEW_RELEASE_PYPI) - set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") + set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90") set(paddle_known_gpu_archs10 "") set(paddle_known_gpu_archs11 "61 70 75 80") + set(paddle_known_gpu_archs12 "61 70 75 80 90") elseif(NEW_RELEASE_JIT) message("Using New Release Strategy - JIT Packge") add_definitions(-DNEW_RELEASE_JIT) - set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86") - set(paddle_known_gpu_archs10 "35 50 60 70 75") - set(paddle_known_gpu_archs11 "35 50 60 70 75 80") + set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90") + set(paddle_known_gpu_archs10 "50 60 70 75") + set(paddle_known_gpu_archs11 "50 60 70 75 80") + set(paddle_known_gpu_archs12 "50 60 70 75 80 90") else() - set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") + set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90") set(paddle_known_gpu_archs10 "50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") + set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90") endif() ###################################################################################### @@ -100,12 +105,12 @@ endfunction() function(select_nvcc_arch_flags out_variable out_arch_bin) # List of arch names set(archs_names - "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" + "Hopper" "All" "Manual") set(archs_name_default "Auto") @@ -144,9 +149,7 @@ function(select_nvcc_arch_flags out_variable out_arch_bin) unset(CUDA_ARCH_PTX CACHE) endif() - if(${CUDA_ARCH_NAME} STREQUAL "Kepler") - set(cuda_arch_bin "30 35") - elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + if(${CUDA_ARCH_NAME} STREQUAL "Maxwell") if(WITH_NV_JETSON) set(cuda_arch_bin "53") else() @@ -176,6 +179,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin) set(cuda_arch_bin "80 86") endif() endif() + elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper") + set(cuda_arch_bin "90") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ 
-266,6 +271,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+ + set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 86") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake index d5000eadbd14d..b1476761897ea 100644 --- a/cmake/phi_header.cmake +++ b/cmake/phi_header.cmake @@ -17,24 +17,21 @@ set(PADDLE_INFERENCE_INSTALL_DIR function(phi_header_path_compat TARGET_PATH) message(STATUS "phi header path compat processing: ${TARGET_PATH}") - string(FIND ${TARGET_PATH} "experimental" pos) - if(pos GREATER 1) - file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") - foreach(header ${HEADERS}) - if(${header} MATCHES ".*.h$") - file(READ ${header} HEADER_CONTENT) - string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" - HEADER_CONTENT "${HEADER_CONTENT}") - string(REPLACE "paddle/fluid/platform/" - "paddle/include/experimental/phi/" HEADER_CONTENT - "${HEADER_CONTENT}") - string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" - HEADER_CONTENT "${HEADER_CONTENT}") - file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "phi header path compat processing complete: ${header}") - endif() - endforeach() - endif() + file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") + foreach(header ${HEADERS}) + if(${header} MATCHES ".*.h$") + file(READ ${header} HEADER_CONTENT) + string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" + HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/fluid/platform/" + "paddle/include/experimental/phi/" HEADER_CONTENT + "${HEADER_CONTENT}") + string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" + HEADER_CONTENT "${HEADER_CONTENT}") + file(WRITE ${header} "${HEADER_CONTENT}") + message(STATUS "phi header path compat processing complete: ${header}") + endif() + endforeach() endfunction() phi_header_path_compat( @@ -51,6 +48,7 @@ phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 63071139a5f40..defc84fbe3d9c 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -821,9 +821,9 @@ void EagerReducer::MarkVarReady(const size_t var_index, auto &group = groups_[group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; - const auto length = group.length_[inside_group_index]; if (!group.is_sparse_) { + const auto length = group.length_[inside_group_index]; if (is_used_var) { auto *autograd_meta = tensors_[var_index].get_autograd_meta(); auto &grad_tensor = diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py 
b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 278fbf127036b..4e105d138b7e8 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -73,7 +73,6 @@ # bacward api's output usually affected by backward api's input special_prune_dict = { "matmul_grad": {"x": "grad_y", "y": "grad_x"}, - "multiply_grad": {"x": "grad_y", "y": "grad_x"}, } @@ -276,6 +275,8 @@ class {} : public egr::GradNodeBase {{ // Before log info {} // Forward API Call +{} + // Check NaN and Inf if needed {} // Get Outputs {} @@ -1675,6 +1676,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): forward_api_name, before_log_str, forward_call_str, + check_nan_inf_str, get_outputs_str, forward_api_name, check_inplace_str, diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index a220fe18fb35d..2216b6b01427e 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -113,7 +113,6 @@ std::vector RunBackward( std::queue force_sequential_nodes_forward_queue = egr::Controller::Instance().GetForceSequentialNodes(); - egr::Controller::Instance().ClearForceSequentialNodes(); std::deque force_sequential_nodes_queue; std::set force_sequential_nodes_set; std::set ready_force_sequential_nodes; @@ -421,6 +420,7 @@ void Backward(const std::vector& tensors, // outputs VLOG(3) << "Run in Backward"; paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::UserDefined, 1); + egr::Controller::Instance().ClearForceSequentialNodes(); RunBackward(tensors, grad_tensors, retain_graph); phi::autotune::AutoTuneStatus::Instance().Update(); } diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 17cf8825d5c15..6eae40fca36cf 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -122,6 +122,11 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { } } +void CheckTensorHasNanOrInf(const std::string& api_name, + const paddle::optional& tensor) { + CheckTensorHasNanOrInf(api_name, tensor.get()); +} + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTwoTensors& tensors) { CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); @@ -169,6 +174,14 @@ void CheckTensorHasNanOrInf(const std::string& api_name, } } +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::optional>& tensors) { + if (tensors) { + CheckTensorHasNanOrInf(api_name, tensors.get()); + } +} + void CheckTensorHasNanOrInf( const std::string& api_name, const paddle::small_vector, diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index cb19fd2f9d794..8d7ed7ffb76b2 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -20,6 +20,7 @@ #include "paddle/fluid/eager/type_defs.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace egr { @@ -36,6 +37,9 @@ using TupleOfTensorAndVector = void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); +void CheckTensorHasNanOrInf(const std::string& api_name, + const paddle::optional& tensor); + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTwoTensors& tensors); @@ -54,6 +58,10 @@ void CheckTensorHasNanOrInf(const std::string& api_name, void CheckTensorHasNanOrInf(const std::string& api_name, const std::vector& 
tensors); +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::optional>& tensors); + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTensorAndVector& tensors); diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index a05f2858c0df3..7e002c8154147 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -83,12 +83,13 @@ struct DataTypeTrait { _ForEachDataTypeHelper_( \ callback, ::paddle::platform::complex, COMPLEX128); -#define _ForEachDataTypeNormal_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); +#define _ForEachDataTypeNormal_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); // For the use of thrust, as index-type elements can be only integers. #define _ForEachDataTypeTiny_(callback) \ diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b9055d38d38c5..de2e38c2f1165 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -50,11 +50,19 @@ message ShardingConfig { optional bool enable_tuning = 15 [ default = false ]; // incubate for auto parallel } +// for dygraph +message MpConfig { + optional bool sync_param= 1 [ default = false ]; + optional bool sync_grad= 2 [ default = false ]; + optional bool sync_moment= 3 [ default = false ]; +} + message HybridConfig { optional int32 dp_degree = 1 [ default = -1 ]; optional int32 mp_degree = 2 [ default = 1 ]; optional int32 pp_degree = 3 [ default = 1 ]; optional int32 sharding_degree = 4 [ default = 1 ]; + optional MpConfig mp_configs = 5; } message AMPConfig { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index f21ca0c858acc..0294e1ca54b43 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -95,7 +95,7 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope, PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( - "Only support lod tensor in GetVariableTensor now.")); + "Only support DenseTensor in GetVariableTensor now.")); return *var->GetMutable(); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 91c3ba6d608b4..b1db3dd0a43cb 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -126,6 +126,7 @@ pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) pass_library(dense_fc_to_sparse_pass inference) pass_library(dense_multihead_matmul_to_sparse_pass inference) +pass_library(delete_cast_op_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -242,7 +243,6 @@ if(WITH_XPU) pass_library(fused_multi_transformer_xpu_quant_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(stack_fuse_pass 
inference DIR xpu DEPS ${XPU_PASS_DEPS}) - pass_library(delete_cast_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) endif() cc_library( @@ -407,6 +407,11 @@ cc_test( test_delete_dequant_weight_linear_op_pass SRCS delete_weight_dequant_linear_op_pass_tester.cc DEPS delete_weight_dequant_linear_op_pass) +cc_test( + test_delete_cast_op_pass + SRCS delete_cast_op_pass_test.cc + DEPS delete_cast_op_pass) + if(WITH_GPU OR WITH_ROCM) cc_test( test_embedding_eltwise_layernorm_fuse_pass @@ -521,8 +526,4 @@ if(WITH_XPU) test_stack_fuse_pass SRCS xpu/stack_fuse_pass_test.cc DEPS stack_fuse_pass) - cc_test( - test_delete_cast_op_pass - SRCS xpu/delete_cast_op_pass_test.cc - DEPS delete_cast_op_pass) endif() diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc similarity index 93% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc rename to paddle/fluid/framework/ir/delete_cast_op_pass.cc index fb417322476b2..bfda0f3238010 100644 --- a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h" -#include +#include "paddle/fluid/framework/ir/delete_cast_op_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -127,11 +126,11 @@ int DeleteCastOpPass::ApplyCastWriteReadPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastWriteReadPass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(write_to_array); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(write_to_array_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array, write_to_array, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array_out, write_to_array_out, pattern); // write_to_array_out(in graph1) may not link to any op nodes, so we fine // read_from_array by write_to_array_out name. @@ -281,13 +280,13 @@ int DeleteCastOpPass::ApplyCastLodResetWriteReadPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastLodResetWriteReadPass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(lod_reset); - GET_IR_NODE(write_to_array); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(lod_reset_out); - GET_IR_NODE(write_to_array_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(lod_reset, lod_reset, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array, write_to_array, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(lod_reset_out, lod_reset_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array_out, write_to_array_out, pattern); // write_to_array_out(in graph1) may not link to any op nodes, so we fine // read_from_array by write_to_array_out name. 
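The handlers in this file are being switched from the XPU-local GET_IR_NODE shorthand to the generic GET_IR_NODE_FROM_SUBGRAPH(var, name, pattern) macro so the pass can live under framework/ir/ and run for non-XPU targets; the graph rewrite each handler performs is unchanged. For readers skimming the hunks: the simplest case, the ApplyCastPass handler further below, bypasses a matched cast by re-linking its consumers to the cast input. A rough, hedged sketch of that rewrite follows; the final node-removal step is assumed and is not part of the lines shown in this diff.

// Hedged sketch of the cast-bypass rewrite performed by ApplyCastPass below:
// cast_in -> cast -> cast_out -> consumers   becomes   cast_in -> consumers.
for (auto* out_op_node : cast_out->outputs) {
  // Point each consumer op at the cast input instead of the cast output.
  out_op_node->Op()->RenameInput(cast_out->Name(), cast_in->Name());
  IR_NODE_LINK_TO(cast_in, out_op_node);
}
// The orphaned cast and cast_out nodes are then removed from the graph;
// that cleanup step is outside the lines shown in this diff.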
@@ -482,13 +481,13 @@ int DeleteCastOpPass::ApplyCastIndexSamplePass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastIndexSamplePass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(index_sample); - GET_IR_NODE(cast1); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(index_sample_out); - GET_IR_NODE(cast1_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(index_sample, index_sample, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1, cast1, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(index_sample_out, index_sample_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1_out, cast1_out, pattern); index_sample->Op()->RenameInput(cast0_out->Name(), cast0_in->Name()); index_sample->Op()->RenameOutput(index_sample_out->Name(), @@ -545,9 +544,9 @@ int DeleteCastOpPass::ApplyCastPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastPass fuse"; - GET_IR_NODE(cast); - GET_IR_NODE(cast_in); - GET_IR_NODE(cast_out); + GET_IR_NODE_FROM_SUBGRAPH(cast, cast, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast_in, cast_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast_out, cast_out, pattern); for (auto* out_op_node : cast_out->outputs) { out_op_node->Op()->RenameInput(cast_out->Name(), cast_in->Name()); IR_NODE_LINK_TO(cast_in, out_op_node); diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h b/paddle/fluid/framework/ir/delete_cast_op_pass.h similarity index 100% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h rename to paddle/fluid/framework/ir/delete_cast_op_pass.h diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc similarity index 100% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass_test.cc rename to paddle/fluid/framework/ir/delete_cast_op_pass_test.cc diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index f124c3cc44adf..0b591120014e3 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -99,13 +99,15 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, auto conv = pattern->NewNode(conv_repr())->assert_is_op(conv_type_); auto input = pattern->NewNode(input_repr()) ->assert_is_op_input(conv_type_, "Input") - ->AsInput(); + ->AsInput() + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 4; + }); auto conv_filter = pattern->NewNode(conv_filter_repr()) ->assert_is_op_input(conv_type_, "Filter") ->AsInput(); auto conv_out = pattern->NewNode(conv_out_repr()) - ->assert_is_op_output(conv_type_, "Output") - ->assert_var_not_persistable(); + ->assert_is_op_output(conv_type_, "Output"); conv->LinksFrom({input, conv_filter}).LinksTo({conv_out}); // ew_bias_add op PDNode* ew_bias_add = nullptr; @@ -116,11 +118,17 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, ew_bias_add_y = pattern->NewNode(ew_bias_add_y_repr()) ->assert_is_op_input("elementwise_add", "Y") ->assert_is_persistable_var() - ->assert_has_n_outputs(1); + ->assert_has_n_outputs(1) + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 1; + }); ew_bias_add = 
pattern->NewNode(ew_bias_add_repr())->assert_is_op("elementwise_add"); ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr()) ->assert_is_op_output("elementwise_add", "Out"); + if (with_bn_ || with_branch_ || !act_type_.empty()) { + ew_bias_add_out->assert_has_n_outputs(1); + } ew_bias_add->LinksFrom({conv_out, ew_bias_add_y}) .LinksTo({ew_bias_add_out}); } else { @@ -159,6 +167,9 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm"); bn_out = pattern->NewNode(bn_out_repr())->assert_is_op_output("batch_norm", "Y"); + if (with_branch_ || !act_type_.empty()) { + bn_out->assert_has_n_outputs(1); + } bn_mean_out = pattern->NewNode(bn_mean_out_repr()) ->assert_is_op_output("batch_norm", "MeanOut"); bn_saved_mean = pattern->NewNode(bn_saved_mean_repr()) @@ -179,23 +190,27 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, bn_out->assert_is_op_input("elementwise_add", "Y")->AsIntermediate(); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "X") - ->AsInput() - ->assert_more([](Node* node) { - return node->Var()->GetShape().size() == 4; - }); + ->AsInput(); } else if (with_branch_y_) { bn_out->assert_is_op_input("elementwise_add", "X")->AsIntermediate(); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "Y") - ->AsInput() - ->assert_more([](Node* node) { - return node->Var()->GetShape().size() == 4; - }); + ->AsInput(); } - ew_branch_add = - pattern->NewNode(ew_branch_add_repr())->assert_is_op("elementwise_add"); + ew_branch_add = pattern->NewNode(ew_branch_add_repr()) + ->assert_is_op("elementwise_add") + ->assert_more([](Node* node) { + if (node->inputs.size() != 2) { + return false; + } + return node->inputs[0]->Var()->GetShape() == + node->inputs[1]->Var()->GetShape(); + }); ew_branch_add_out = pattern->NewNode(ew_branch_add_out_repr()) ->assert_is_op_output("elementwise_add", "Out"); + if (!act_type_.empty()) { + ew_branch_add_out->assert_has_n_outputs(1); + } ew_branch_add->LinksFrom({bn_out, ew_branch_add_in}) .LinksTo({ew_branch_add_out}); } else { @@ -401,6 +416,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, scope->FindVar(conv_filter->Name())->GetMutable(); auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_conv_bias; + bool has_branch = with_branch_x || with_branch_y; // Create conv_fusion_bias (conv bias) variable Node* fusion_bias_node = nullptr; if (has_bias) { @@ -501,18 +517,17 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, framework::OpDesc conv2d_xpu_op_desc(block); // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); - conv2d_xpu_op_desc.SetInput("input", {input->Name()}); + conv2d_xpu_op_desc.SetInput("x", {input->Name()}); conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()}); conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()}); - conv2d_xpu_op_desc.SetOutput("output", {conv2d_xpu_out_name}); - conv2d_xpu_op_desc.SetOutput("output_max", {conv_out_max_name}); + conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name}); + conv2d_xpu_op_desc.SetOutput("out_max", {conv_out_max_name}); // set fusion_bias input node if (has_bias) { conv2d_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()}); - conv2d_xpu_op_desc.SetAttr("has_bias", has_bias); } // set ew_branch_add input node - if (ew_branch_add_in != nullptr) { + if (ew_branch_add != nullptr) { conv2d_xpu_op_desc.SetInput("branch", {ew_branch_add_in->Name()}); } // set attrs of 
conv2d_xpu @@ -566,7 +581,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, conv2d_xpu_op_desc.SetAttr("place_z", std::vector{10}); conv2d_xpu_op_desc.SetAttr("paddings", conv_paddings); conv2d_xpu_op_desc.SetAttr("block_lod", std::vector{1}); - conv2d_xpu_op_desc.SetAttr("has_branch", with_branch_x || with_branch_y); + conv2d_xpu_op_desc.SetAttr("has_branch", has_branch); + conv2d_xpu_op_desc.SetAttr("has_bias", has_bias); auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); IR_NODE_LINK_TO(input, conv2d_xpu); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 359bab844303f..4c1538a28fedb 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -31,6 +31,7 @@ #include "cinn/frontend/syntax.h" #include "cinn/hlir/framework/graph.h" #include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/visualize_helper.h" #include "gflags/gflags.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" @@ -49,6 +50,7 @@ DECLARE_bool(enable_pe_launch_cinn); DECLARE_bool(enable_cinn_auto_tune); +DECLARE_string(cinn_subgraph_graphviz_dir); namespace paddle { namespace framework { namespace paddle2cinn { @@ -73,7 +75,6 @@ const CinnCompiledObject &CinnCompiler::Compile( const std::map &input_tensors, const Target &target, void *stream) { - VLOG(4) << "-- The graph to be compiled is:\n" << VizGraph(graph); CinnCacheKeyByAddress cur_key_by_address( graph, input_tensors, target.arch_str()); CinnCacheKeyByStructure cur_key_by_struct; @@ -85,6 +86,26 @@ const CinnCompiledObject &CinnCompiler::Compile( if (!cache_by_struct_.count(cur_key_by_struct)) { VLOG(4) << "Not found CinnCompiledObject in cache_by_struct_."; std::int64_t compiled_num = real_compiled_num_.fetch_add(1); + + if (!FLAGS_cinn_subgraph_graphviz_dir.empty()) { + const std::string &viz_path = FLAGS_cinn_subgraph_graphviz_dir + + "/fusion_groups_" + + std::to_string(compiled_num) + "/"; + if (!::cinn::hlir::framework::MakeDirectory( + viz_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { + LOG_IF(WARNING, compiled_num == 0) + << "Failed to make directory: \"" << viz_path + << "\", the CINN subgraph's graphviz dot file will not print."; + } else { + LOG_IF(INFO, compiled_num == 0) + << "The CINN subgraph's graphviz dot file will writing into " + "path: \"" + << FLAGS_cinn_subgraph_graphviz_dir << "\""; + ::cinn::hlir::framework::WriteToFile(viz_path + "cinn_subgraph.dot", + VizGraph(graph)); + } + } + auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); std::unique_lock guard(lock_); diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 4dba2d16d598c..2bdbead6aae0d 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -402,10 +402,12 @@ class ArgmaxOpTransformer case paddle::framework::proto::AttrType::INT: { auto axis = PADDLE_GET_CONST(int, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); + break; } case paddle::framework::proto::AttrType::LONG: { auto axis = PADDLE_GET_CONST(int64_t, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); + break; } default: VLOG(4) << "The data_type of axis is Error, axis must be int or " diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc 
b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 2589a20eb284d..963197850c9fd 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -102,32 +102,53 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { framework::ProgramDesc mixed_program_desc; framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); - auto parameters = scope_.LocalVarNames(); - std::sort(parameters.begin(), parameters.end()); - - auto SerializeParams = [&]() -> std::string { - std::ostringstream os; - phi::CPUContext ctx; - for (const auto& param : parameters) { - PADDLE_ENFORCE_NOT_NULL( - scope_.FindVar(param), - platform::errors::NotFound( - "Block should already have a '%s' variable", param)); - auto* tensor = scope_.FindVar(param)->GetMutable(); - framework::SerializeToStream(os, *tensor, ctx); + auto SerializeParams = [&](const std::string& path) { + auto IsPersistable = [](const framework::VarDesc* var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { + return true; + } + return false; + }; + framework::ProgramDesc save_program; + auto* save_block = save_program.MutableBlock(0); + + const auto& global_block = mixed_program_desc.Block(0); + std::vector save_var_list; + for (framework::VarDesc* var : global_block.AllVars()) { + if (IsPersistable(var)) { + framework::VarDesc* new_var = save_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + save_var_list.push_back(new_var->Name()); + } } - return os.str(); + std::sort(save_var_list.begin(), save_var_list.end()); + auto* op = save_block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", save_var_list); + op->SetAttr("file_path", path); + op->CheckAttrs(); + + framework::Executor exe(platform::CPUPlace{}); + exe.Run(save_program, &scope_, 0, true, true); }; - auto StrToBinary = [](const std::string& path, const std::string& str) { + auto SerializeProg = [&](const std::string& path) { + auto str = mixed_program_desc.Proto()->SerializeAsString(); std::ofstream file(path.c_str(), std::ios::binary); file.write(str.c_str(), str.size()); file.close(); }; - StrToBinary(mixed_model_file_, - mixed_program_desc.Proto()->SerializeAsString()); - StrToBinary(mixed_params_file_, SerializeParams()); + SerializeProg(mixed_model_file_); + SerializeParams(mixed_params_file_); } bool OpSupportPrecision(const std::string& op_type, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e50adbedc54cb..790c32b31e129 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -155,11 +155,10 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) { return phi::Backend::CPU; } } -} // namespace -bool PaddleTensorToLoDTensor(const PaddleTensor &pt, - phi::DenseTensor *t, - const platform::Place &place) { +bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + phi::DenseTensor *t, + const platform::Place &place) { framework::DDim ddim = phi::make_ddim(pt.shape); void *input_ptr; if (pt.dtype == PaddleDType::INT64) { @@ -270,6 +269,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor 
&pt, t->set_lod(lod); return true; } +} // namespace bool AnalysisPredictor::Init( const std::shared_ptr<framework::Scope> &parent_scope, @@ -919,6 +919,17 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) { #endif } +void AnalysisPredictor::MkldnnPreSet( + const std::vector<paddle::Tensor> &inputs) { +#ifdef PADDLE_WITH_MKLDNN + std::vector<std::vector<int>> inputs_shape; + for (size_t i = 0; i < inputs.size(); ++i) { + inputs_shape.emplace_back(phi::vectorize<int>(inputs[i].dims())); + } + MkldnnPreSet(inputs_shape); +#endif +} + void AnalysisPredictor::MkldnnPreSet( const std::vector<std::vector<int>> &inputs_shape) { #ifdef PADDLE_WITH_MKLDNN @@ -1033,6 +1044,70 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs, return true; } +bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs, + std::vector<paddle::Tensor> *outputs) { + inference::DisplayMemoryInfo(place_, "before run"); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPreSet(inputs); +#endif + VLOG(3) << "predict start"; + // set feed variable + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::PreconditionNotMet("The scope should not be nullptr.")); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + +#ifdef PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled()) { + inference::tensorrt::TensorRTEngine::predictor_id_per_thread = + predictor_id_; + VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: " + << inference::tensorrt::TensorRTEngine::predictor_id_per_thread; + } +#endif + + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); + + inference::DisplayMemoryInfo(place_, "after run"); + + // get fetch variable + if (!GetFetch(outputs, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + + // All the containers in the scope will be held in inference, but the + // operators assume that the container will be reset after each batch. + // Here is a bugfix, collect all the container variables, and reset them to a + // bool; the next time, the operator will call MutableData and construct a new + // container again, so that the container will be empty for each batch. + if (sub_scope_) { + tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); + } + tensor_array_batch_cleaner_.ResetNoTensorVars(); + + // recover the cpu_math_library_num_threads to 1, in order to avoid thread + // conflict when integrating it into deployment service. + paddle::platform::SetNumThreads(1); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPostReset(); +#endif +#if defined(PADDLE_WITH_MKLML) + // Frees unused memory allocated by the Intel® MKL Memory Allocator to + // avoid memory leak.
See: + // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers + platform::dynload::MKL_Free_Buffers(); +#endif + return true; +} + bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; @@ -1047,7 +1122,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, for (size_t i = 0; i < inputs.size(); ++i) { phi::DenseTensor *input = &feed_tensors_[i]; - if (!PaddleTensorToLoDTensor(inputs[i], input, place_)) { + if (!PaddleTensorToDenseTensor(inputs[i], input, place_)) { return false; } int idx = -1; @@ -1061,7 +1136,41 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } else { idx = PADDLE_GET_CONST(int, feeds_[i]->GetAttr("col")); } - framework::SetFeedVariable(scope, *input, "feed", idx); + framework::SetFeedVariable(scope, *input, framework::kFeedOpType, idx); + } + return true; +} + +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + PADDLE_ENFORCE_EQ(inputs.size(), + feeds_.size(), + platform::errors::InvalidArgument( + "wrong feed input size, need %d but get %d.", + feeds_.size(), + inputs.size())); + for (size_t i = 0; i < inputs.size(); ++i) { + PADDLE_ENFORCE_EQ(inputs[i].initialized(), + true, + paddle::platform::errors::InvalidArgument( + "The input Tensor expected to be initialized.")); + } + + if (std::all_of(inputs.cbegin(), inputs.cend(), [&](const paddle::Tensor &t) { + return !t.name().empty() && feed_names_.count(t.name()); + })) { + for (size_t i = 0; i < inputs.size(); ++i) { + auto &t = framework::GetVariableTensor(*scope, inputs[i].name()); + t.ShareDataWith( + *std::dynamic_pointer_cast(inputs[i].impl())); + } + } else { + for (size_t i = 0; i < inputs.size(); ++i) { + auto &t = framework::GetVariableTensor(*scope, idx2feeds_[i]); + t.ShareDataWith( + *std::dynamic_pointer_cast(inputs[i].impl())); + } } return true; } @@ -1100,7 +1209,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, idx, i)); framework::FetchType &fetch_var = - framework::GetFetchVariable(*scope, "fetch", idx); + framework::GetFetchVariable(*scope, framework::kFetchOpType, idx); auto &fetch = PADDLE_GET(phi::DenseTensor, fetch_var); auto type = framework::TransToProtoVarType(fetch.dtype()); auto output = &(outputs->at(i)); @@ -1125,6 +1234,19 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, return true; } +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + auto const &name = idx2fetches_[i]; + auto &t = framework::GetVariableTensor(*scope, name); + (*outputs)[i] = + std::move(paddle::Tensor(std::make_shared(t), name)); + } + return true; +} + void AnalysisPredictor::PrepareArgument() { VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. 
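The three tensor-based members added above (Run, SetFeed and GetFetch taking paddle::Tensor) form a new tensor-in/tensor-out execution path for AnalysisPredictor: inputs are shared into the feed variables by name when every input tensor is named and matches a feed, otherwise in feed-index order, and each fetch variable comes back wrapped as a paddle::Tensor. Below is a minimal, hedged usage sketch of the matching public Predictor::Run overload declared later in this patch; creation of the predictor and of the named input tensors is assumed and not shown.

// Hedged sketch: calling the new paddle::Tensor overload of Predictor::Run.
// `predictor` and `inputs` are assumed to be prepared by the caller.
bool RunWithTensors(paddle_infer::Predictor* predictor,
                    const std::vector<paddle::Tensor>& inputs,
                    std::vector<paddle::Tensor>* outputs) {
  // Feeding is by tensor name when every name matches a feed variable,
  // otherwise positional (feed-index) order is used, as in SetFeed above.
  if (!predictor->Run(inputs, outputs)) {
    return false;  // SetFeed or GetFetch reported a failure
  }
  // (*outputs)[i] wraps the i-th fetch variable as a paddle::Tensor.
  return true;
}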
@@ -1579,7 +1701,7 @@ void AnalysisPredictor::PrepareFeedFetch() { "The sub_scope should not be nullptr.")); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { - if (op->Type() == "feed") { + if (op->Type() == framework::kFeedOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (feeds_.size() <= static_cast(idx)) { feeds_.resize(idx + 1); @@ -1587,7 +1709,7 @@ void AnalysisPredictor::PrepareFeedFetch() { feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; idx2feeds_[idx] = op->Output("Out")[0]; - } else if (op->Type() == "fetch") { + } else if (op->Type() == framework::kFetchOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); @@ -1602,9 +1724,9 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("The scope should not be nullptr.")); - auto *var = scope->Var("feed"); + auto *var = scope->Var(framework::kFeedOpType); var->GetMutable(); - var = scope->Var("fetch"); + var = scope->Var(framework::kFetchOpType); var->GetMutable(); } @@ -2186,7 +2308,7 @@ void AnalysisPredictor::ClearIntermediateTensor() { const std::string name = var->Name(); auto *variable = executor_->GetScope()->FindVar(name); if (variable != nullptr && variable->IsType() && - name != "feed" && name != "fetch") { + name != framework::kFeedOpType && name != framework::kFetchOpType) { VLOG(3) << "Clear Intermediate Tensor: " << name; auto *t = variable->GetMutable(); t->clear(); @@ -2567,6 +2689,7 @@ USE_TRT_CONVERTER(expand_as_v2) USE_TRT_CONVERTER(take_along_axis) USE_TRT_CONVERTER(skip_groupnorm_act) USE_TRT_CONVERTER(preln_groupnorm_act) +USE_TRT_CONVERTER(cumsum) #if IS_TRT_VERSION_GE(8522) USE_TRT_CONVERTER(flash_multihead_matmul) USE_TRT_CONVERTER(cross_multihead_matmul) @@ -2654,6 +2777,11 @@ std::map Predictor::GetOutputTypes() { bool Predictor::Run() { return predictor_->ZeroCopyRun(); } +bool Predictor::Run(const std::vector &inputs, + std::vector *outputs) { + return predictor_->Run(inputs, outputs); +} + std::unique_ptr Predictor::Clone(void *stream) { auto analysis_pred = predictor_->Clone(stream); std::unique_ptr pred(new Predictor(std::move(analysis_pred))); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5a578a9b94fcb..83207a8bfd654 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -31,15 +31,14 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/resource_manager.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_TESTING #include #include #endif namespace paddle_infer { -using float16 = paddle::platform::float16; namespace experimental { class InternalUtils; }; @@ -150,6 +149,16 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + /// + /// \brief Run the prediction engine (Recommended). 
+ /// + /// \param[in] inputs input tensors + /// \param[out] outputs output tensors + /// \return Whether the function executed successfully + /// + bool Run(const std::vector<paddle::Tensor> &inputs, + std::vector<paddle::Tensor> *outputs) override; + /// /// \brief Get the input names /// @@ -378,6 +387,17 @@ class AnalysisPredictor : public PaddlePredictor { /// bool SetFeed(const std::vector<PaddleTensor> &input_datas, framework::Scope *scope); + + /// + /// \brief Prepare input data, only used in Run() + /// + /// \param[in] inputs input tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool SetFeed(const std::vector<paddle::Tensor> &inputs, + framework::Scope *scope); + /// /// \brief Get the output data, only used in Run() /// @@ -387,6 +407,16 @@ class AnalysisPredictor : public PaddlePredictor { /// bool GetFetch(std::vector<PaddleTensor> *output_data, framework::Scope *scope); + + /// + /// \brief Get the output data, only used in Run() + /// + /// \param[out] outputs output tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool GetFetch(std::vector<paddle::Tensor> *outputs, framework::Scope *scope); + /// /// \brief Get the output data, only used in GetFetch() /// @@ -404,6 +434,14 @@ class AnalysisPredictor : public PaddlePredictor { /// \param[in] inputs tensors /// void MkldnnPreSet(const std::vector<PaddleTensor> &inputs); + /// + /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. + /// + /// Used in AnalysisPredictor::Run(). + /// + /// \param[in] inputs tensors + /// + void MkldnnPreSet(const std::vector<paddle::Tensor> &inputs); /// /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index fc23caee65638..11f214bc45d53 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -83,7 +83,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index e83c1a9f9444c..3a51f91b3afc2 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -221,6 +221,16 @@ class PD_INFER_DECL PaddlePredictor { std::vector<PaddleTensor>* output_data, int batch_size = -1) = 0; + /// \brief This interface takes input and runs the network (Recommended). + /// \param[in] inputs A list of Tensors as the input to the network. + /// \param[out] outputs Pointer to the tensor list, which holds the output + /// Tensors + /// \return Whether the run is successful + virtual bool Run(const std::vector<paddle::Tensor>& inputs, + std::vector<paddle::Tensor>* outputs) { + return false; + } + /// \brief Used to get the name of the network input. /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. /// \return Input tensor names. diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index d7f15e0529894..54a9d9af117ca 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -128,6 +128,17 @@ class PD_INFER_DECL Predictor { /// bool Run(); + /// + /// \brief Run the prediction engine (Recommended) + /// + /// \param[in] inputs A list of Tensors as the input to the network.
+ /// \param[out] outputs Pointer to the tensor list, which holds the output + /// Tensor + /// + /// \return Whether the run is successful + bool Run(const std::vector& inputs, + std::vector* outputs); + /// /// \brief Get the output names /// diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 3cc8b077ad7e6..a1fe08b081eeb 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -276,6 +276,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "transpose_flatten_concat_fuse_pass", // "conv2d_fusion_layout_transfer_pass", // "auto_mixed_precision_pass", // + "delete_cast_op_pass", // "inplace_op_var_pass", // should be the last pass. }); diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 0301892792dc3..b9c86a60f55b8 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -21,6 +21,8 @@ #include "paddle_infer_declare.h" // NOLINT +#include "paddle/phi/api/include/tensor.h" // expose paddle::Tensor + #ifdef PADDLE_WITH_ONNXRUNTIME #include "onnxruntime_c_api.h" // NOLINT #include "onnxruntime_cxx_api.h" // NOLINT diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index cbe26a3d31e4d..1793e1207771e 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -106,6 +106,7 @@ list( skip_groupnorm_act_op.cc preln_groupnorm_act_op.cc expand_v2_op.cc + cumsum_op.cc temporal_shift_op.cc) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) diff --git a/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc b/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc new file mode 100644 index 0000000000000..a46bf1efa171b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Cumsum Op + */ +class CumsumOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { +#if IS_TRT_VERSION_GE(7220) + VLOG(3) << "convert a cumsum op to tensorrt layer"; + framework::OpDesc op_desc(op, nullptr); + std::string input_x_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + auto* input_x_tensor = engine_->GetITensor(input_x_name); + auto dims = input_x_tensor->getDimensions(); + auto rank = dims.nbDims; + int axis = 0; + if (op_desc.HasAttr("axis")) { + axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis += rank; + } + } + + // getAxisLength default is a scalar + auto getAxisLength = + [&](nvinfer1::ITensor* inpTensor, int axis, bool scalar = true) { + auto dims = inpTensor->getDimensions(); + int d = dims.d[axis]; + if (d >= 0) { + return Add1DConstantLayer(d, "", scalar); + } else { + nvinfer1::ITensor* inpShape = Shape(inpTensor); + return GetEleTensorOfShape(inpShape, d, scalar); + } + }; + + // Create "inputSliced" tensor that is sliced on dimension[axis] to length 1 + nvinfer1::Dims start; + start.nbDims = rank; + std::vector start_vec(rank, 0); + std::fill(start.d, start.d + rank, 0); + + nvinfer1::Dims size; + size.nbDims = rank; + nvinfer1::Dims stride; + stride.nbDims = rank; + auto axisLength = getAxisLength(input_x_tensor, axis, false); + + auto starts_tensor = + Add1DConstantLayer(start_vec, output_name + "_start_tensor_"); + auto sizes_tensor = axis == 0 ? Add1DConstantLayer(1) + : getAxisLength(input_x_tensor, 0, false); + auto strides_tensor = axis == 0 ? 
axisLength : Add1DConstantLayer(1); + + for (int i = 1; i < rank; i++) { + if (i == axis) { + std::vector strides_itensors = {strides_tensor, + axisLength}; + strides_tensor = Concat(strides_itensors); + std::vector sizes_itensors = { + sizes_tensor, Add1DConstantLayer(1)}; + sizes_tensor = Concat(sizes_itensors); + } else { + auto currLength = getAxisLength(input_x_tensor, i, false); + std::vector strides_itensors = { + strides_tensor, Add1DConstantLayer(1)}; + strides_tensor = Concat(strides_itensors); + std::vector sizes_itensors = {sizes_tensor, + currLength}; + sizes_tensor = Concat(sizes_itensors); + } + } + + auto inputSliced = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *input_x_tensor, start, size, stride); + inputSliced->setInput(1, *starts_tensor); + inputSliced->setInput(2, *sizes_tensor); + inputSliced->setInput(3, *strides_tensor); + auto inputSliced_output = inputSliced->getOutput(0); + + // Scan through each slice across axis and add it to the running sum + auto loop = TRT_ENGINE_ADD_LAYER(engine_, Loop); + nvinfer1::ITensor* tripLimit = getAxisLength(input_x_tensor, axis); + loop->addTripLimit(*tripLimit, nvinfer1::TripLimit::kCOUNT); + auto iterator = loop->addIterator(*input_x_tensor, axis); + auto data = iterator->getOutput(0); + + // Squeeze inputSliced down to same shape as `data` + auto sliced_dims = inputSliced_output->getDimensions(); + std::vector subscripts(sliced_dims.nbDims); + std::iota(subscripts.begin(), subscripts.end(), 0); + auto p = std::remove_if(subscripts.begin(), + subscripts.end(), + [axis](int x) { return x == axis; }); + subscripts.resize(p - subscripts.begin()); + auto newDims = Gather(Shape(inputSliced_output), subscripts); + inputSliced_output = Reshape(inputSliced_output, newDims); + + // creat ZeroTensor + std::vector zero_vec{0.f}; + auto zero = Add1DConstantLayer(zero_vec); + auto cast = TRT_ENGINE_ADD_LAYER(engine_, Identity, *zero); + cast->setOutputType(0, inputSliced_output->getType()); + + zero = TRT_ENGINE_ADD_LAYER( + engine_, + ElementWise, + *inputSliced_output, + *BroadcastTensors(cast->getOutput(0), inputSliced_output), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + + auto runningSum = loop->addRecurrence(*zero); + auto runningSumTensor = runningSum->getOutput(0); + auto curSum = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *data, + *runningSumTensor, + nvinfer1::ElementWiseOperation::kSUM); + runningSum->setInput(1, *curSum->getOutput(0)); + auto reverseFlag = nvinfer1::LoopOutput::kCONCATENATE; + nvinfer1::ILoopOutputLayer* loopOut = + loop->addLoopOutput(*curSum->getOutput(0), reverseFlag, axis); + loopOut->setInput(1, *tripLimit); + RreplenishLayerAndOutput(loopOut, "cumsum", {output_name}, test_mode); +#else + VLOG(3) << "Cumsum is not supported when TensorRT < 7.2.2"; +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(cumsum, CumsumOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f0e0d969a1600..e2dfe4d5ba304 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -427,6 +427,41 @@ class OpConverter { return shuffle->getOutput(0); } + nvinfer1::ITensor* BroadcastTensor(nvinfer1::ITensor* input, + const int nbDims) { + auto oldShape = Shape(input); + auto oldShapeDims = oldShape->getDimensions(); + const int rank = oldShapeDims.nbDims; + if (rank > nbDims) { + 
PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot broadcast a higher rank tensor to a lower rank tensor.")); + } + if (rank < nbDims) { + nvinfer1::ITensor* concat_shape_tensor; + auto* one_rank_tensor = + Add1DConstantLayer(std::vector(nbDims - rank, 1)); + std::vector itensors; + itensors.push_back(one_rank_tensor); + itensors.push_back(oldShape); + concat_shape_tensor = Concat(itensors); + input = Reshape(input, concat_shape_tensor); + } + return input; + } + + nvinfer1::ITensor* BroadcastTensors(nvinfer1::ITensor* a, + nvinfer1::ITensor* b) { + const int aDims = a->getDimensions().nbDims; + const int bDims = b->getDimensions().nbDims; + if (aDims == bDims) { + VLOG(3) << "Broadcast two equal rank tensors"; + } + if (aDims > bDims) { + return BroadcastTensor(b, aDims); + } + return BroadcastTensor(a, bDims); + } + // Concat not make rank changed nvinfer1::ITensor* Concat(const std::vector& inputs, int axis = 0) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index affc7327a3cee..9ce57fe6aee91 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2729,6 +2729,25 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } + if (op_type == "cumsum") { +#if !IS_TRT_VERSION_GE(7220) + VLOG(3) << "cumsum is not supported when TensorRT < 7.2.2"; + return false; +#endif + if (!with_dynamic_shape) { + VLOG(3) << "the cumsum does not support " + "static shape yet"; + return false; + } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + } + if (op_type == "temporal_shift") { #if !IS_TRT_VERSION_GE(8200) VLOG(3) << "temporal_shift is not supported when TensorRT < 8.2"; @@ -2931,7 +2950,8 @@ struct SimpleOpTypeSetTeller : public Teller { "skip_groupnorm_act", "preln_groupnorm_act", "temporal_shift", - "grid_sampler"}; + "grid_sampler", + "cumsum"}; std::unordered_set teller_set{ "mul", @@ -3090,7 +3110,8 @@ struct SimpleOpTypeSetTeller : public Teller { "skip_groupnorm_act", "preln_groupnorm_act", "temporal_shift", - "grid_sampler"}; + "grid_sampler", + "cumsum"}; }; struct GenericPluginTeller : public Teller { diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index d18f4f487dbe2..54e35bc0f69dd 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -22,11 +22,6 @@ namespace paddle { namespace jit { -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t); -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place); - PredictorEngine::PredictorEngine( const std::shared_ptr &info, const std::shared_ptr ¶ms_dict, @@ -52,6 +47,7 @@ PredictorEngine::PredictorEngine( config.SetSkipLoadParams(true); config.SetApplyOptim(true); config.SwitchIrOptim(true); + config.SwitchUseFeedFetchOps(false); predictor_.reset(new AnalysisPredictor(config)); @@ -78,135 +74,15 @@ std::unique_ptr PredictorEngine::Clone(void *stream) { std::vector PredictorEngine::operator()( const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); -} - -std::vector PredictorEngine::operator()( - const std::vector &inputs) { - std::vector pt_inputs; - std::vector pt_outputs; - for (auto &t : inputs) 
{ - auto non_const_t = const_cast(&t); - pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t)); - } - - predictor_->Run(pt_inputs, &pt_outputs); - - std::vector outputs; - for (auto &pt : pt_outputs) { - DenseTensor t; - PaddleTensorToDenseTensor(pt, &t, place_); - outputs.emplace_back(t); - } + std::vector outputs; + predictor_->Run(inputs, &outputs); return outputs; } -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { - PaddleTensor pt; - switch (framework::TransToProtoVarType(t->dtype())) { - case framework::proto::VarType::INT32: { - pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); - pt.dtype = PaddleDType::INT32; - } break; - case framework::proto::VarType::INT64: { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); - pt.dtype = PaddleDType::INT64; - } break; - case framework::proto::VarType::FP32: { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); - pt.dtype = PaddleDType::FLOAT32; - } break; - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported tensor date type. Now " - "only supports INT64, FP32, INT32.")); - } - pt.shape = phi::vectorize(t->dims()); - return pt; -} - -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place) { - framework::DDim ddim = phi::make_ddim(pt.shape); - void *input_ptr; - switch (pt.dtype) { - case PaddleDType::INT64: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::INT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT16: - input_ptr = t->mutable_data(ddim, place); - break; - default: - LOG(ERROR) << "unsupported feed type " << pt.dtype; - return false; - } - - PADDLE_ENFORCE_NOT_NULL( - input_ptr, - paddle::platform::errors::Fatal( - "Cannot convert to LoDTensor because LoDTensor creation failed.")); - PADDLE_ENFORCE_NOT_NULL( - pt.data.data(), - paddle::platform::errors::InvalidArgument( - "The data contained in the input PaddleTensor is illegal.")); - - if (platform::is_cpu_place(place)) { - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. 
- std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); - } else if (platform::is_ipu_place(place)) { -#ifdef PADDLE_WITH_IPU - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with WITH_IPU, should not reach here.")); -#endif - } else if (platform::is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), - false, - platform::errors::InvalidArgument( - "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = static_cast(pool.Get(place)); - auto dst_gpu_place = place; - memory::Copy(dst_gpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length(), - dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with CUDA, should not reach here.")); -#endif - } else if (platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - auto dst_xpu_place = place; - memory::Copy(dst_xpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with XPU, should not reach here.")); -#endif - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The analysis predictor supports CPU, GPU and XPU now.")); - } - return true; +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + return utils::ToDenseTensors(this->operator()(utils::ToTensors(inputs))); } } // namespace jit diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc deleted file mode 100644 index a59b78c3cd44b..0000000000000 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class AverageAccumulatesOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "param"), - ctx.GetPlace()); - } -}; - -class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("param", "(Tensor), The parameter to be accumulated."); - AddInput("in_sum_1", - "(Tensor), A tensor used to store the parameter " - "sums with the same shape as input(param)."); - AddInput("in_sum_2", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param). It is used to avoid loss of precision due to too " - "many sums."); - AddInput("in_sum_3", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param)."); - AddInput("in_num_accumulates", - "(Tensor), The accumulating times of current window with " - "shape [1]."); - AddInput( - "in_old_num_accumulates", - "(Tensor), The accumulating times of previous window with " - "shape [1]."); - AddInput("in_num_updates", - "(Tensor), The total number of batches used by training " - "before this batch with shape [1]."); - - AddOutput("out_sum_1", - "(Tensor), A tensor used to store the " - "parameter sums with the same shape as input(param)."); - AddOutput("out_sum_2", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param). It is used to avoid loss of precision due to too " - "many sums."); - AddOutput("out_sum_3", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param)."); - AddOutput( - "out_num_accumulates", - "(Tensor), The accumulating times of current window with " - "shape [1]."); - AddOutput( - "out_old_num_accumulates", - "(Tensor) The accumulating times of previous window with " - "shape [1]."); - AddOutput("out_num_updates", - "(Tensor), The total number of batches used by training " - "before this batch with shape [1]."); - - AddAttr("average_window", - "(float, default 0) " - "The rate of average window size relative to num_updates.") - .SetDefault(0); - AddAttr("max_average_window", - "(int64_t) " - "Maximum size of average window. It suggests that the " - "number of mini-batches " - "in one pass is appropriate value to set."); - AddAttr("min_average_window", - "(int64_t, default 10000L) " - "Minimu size of average window.") - .SetDefault(10000L); - - AddComment(R"DOC( -AverageAccumulates Operator. -Accumulate the sum of parameter within sliding window. The size of sliding window is -determined by 'average_window', 'max_average_window' and 'min_average_window'. -Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. -'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. - -All the accumulators were inited to zero before training. 
- -And for a mini-batch in training, accumulators were computed as below steps: - num_updates += 1 - num_accumulates += 1 - sum_1 += param - if num_updates % kMaxNumAccumulates == 0: - sum_2 += sum_1 - sum_1 = 0 - if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): - sum_3 = sum_1 + sum_2 - sum_1 = 0 - sum_2 = 0 - old_num_accumulates = num_accumulates - num_accumulates = 0 - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, - AverageAccumulatesInferShapeFunctor, - PD_INFER_META(phi::AverageAccumulatesInferMeta)); - -REGISTER_OPERATOR( - average_accumulates, - ops::AverageAccumulatesOp, - ops::AverageAccumulatesOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AverageAccumulatesInferShapeFunctor); diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc deleted file mode 100644 index 3805e11d752e3..0000000000000 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(clip_by_norm, - ClipByNormInferShapeFunctor, - PD_INFER_META(phi::ClipByNormInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, - ops::ClipByNormOp, - ops::ClipByNormOpMaker, - ClipByNormInferShapeFunctor); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3855733a98271..b356497962689 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -27,8 +27,6 @@ register_operators( gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op - c_gen_hccl_id_op - gen_hccl_id_op c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc deleted file mode 100644 index 98bcd78b9dadc..0000000000000 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { - -class CCommInitOpAscend : public framework::OperatorBase { - public: - CCommInitOpAscend(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), - true, - platform::errors::PreconditionNotMet( - "CCommInitOpAscend can run on npu place only.")); - - auto var = scope.FindVar(Input("X")); - PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::InvalidArgument("Input con not be empty.")); - - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); - AddComment(R"DOC( -CCommInit operator - -Initialize collective communicatoin context within this trainer -)DOC"); - AddAttr("rank_ids", - "(int) The number of ranks of distributed trainers"); - AddAttr("rank", - "(int) The rank of the trainer in distributed training."); - AddAttr("device_id", - "(int) The deivce_id on which to initialize the communicator." - "Now, you only have to set this attr manually for pipeline " - "training. Otherwise, make it as default.") - .SetDefault(-1); - AddAttr("ring_id", "(int default 0) user specified ring id") - .SetDefault(0); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(c_comm_init_hccl, - ops::CCommInitOpAscend, - ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc deleted file mode 100644 index 130c45dfaad50..0000000000000 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -class CGenHCCLIdOp : public framework::OperatorBase { - public: - CGenHCCLIdOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override {} -}; - -class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - VLOG(3) << "ele"; - AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); - AddComment(R"DOC( -CGenHCCLId operator - -For trainer 0: generate a new UniqueId and send it to all the other trainers. -For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. -)DOC"); - AddAttr("endpoint", - "(string), e.g. 127.0.0.1:6175 " - "current listen endpoint"); - AddAttr>( - "other_endpoints", - "['trainer1_ip:port', 'trainer2_ip:port', ...] " - "list of other trainer endpoints") - .SetDefault({}); - AddAttr("rank", - "(int default 0) " - "The rank of the trainer in distributed training.") - .SetDefault(0); - AddAttr("ring_id", "(int default 0) user specified ring id") - .SetDefault(0); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc deleted file mode 100644 index d472d589de544..0000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { - -class GenHCCLIdOp : public framework::OperatorBase { - public: - GenHCCLIdOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override {} -}; - -class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); - AddComment(R"DOC( -GenHCCLId operator - -For trainer 0: generate a new UniqueId and send it to all the other trainers. -For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. -)DOC"); - AddAttr>( - "trainers", - "['trainer0_ip:port', 'trainer1_ip:port', ...] " - "list of all trainer endpoints") - .SetDefault({}); - AddAttr("trainer_id", - "(int) " - "The index of the trainer in distributed training."); - AddAttr("hccl_comm_num", - "(int default 1) " - "The number of nccl communicator num.") - .SetDefault(1); - AddAttr("use_hierarchical_allreduce", - "(bool default false) " - "Wheter to use hierarchical allreduce.") - .SetDefault(false); - AddAttr("hierarchical_allreduce_inter_nranks", - "(int default 1) " - "Wheter to use hierarchical allreduce.") - .SetDefault(-1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc deleted file mode 100644 index 41367305e2666..0000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc +++ /dev/null @@ -1,378 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/string/split.h" - -DECLARE_int32(get_host_by_name_time); - -namespace paddle { -namespace operators { - -constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; -#define HCCL_UNIQUE_ID_BYTES 1024 - -// Check system calls, such as socket, bind. -#define CHECK_SYS_CALL(call, name) \ - do { \ - int retval; \ - CHECK_SYS_CALL_VAL(call, name, retval); \ - } while (false) - -#define CHECK_SYS_CALL_VAL(call, name, retval) \ - do { \ - RETRY_SYS_CALL_VAL(call, name, retval); \ - if (retval == -1) { \ - PADDLE_THROW(platform::errors::Unavailable( \ - "Call to %s failed: %s", name, strerror(errno))); \ - } \ - } while (false) - -#define RETRY_SYS_CALL_VAL(call, name, retval) \ - do { \ - retval = (call); \ - if (retval == -1 && \ - (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ - << " retry"; \ - } else { \ - break; \ - } \ - } while (true) - -static int SocketSend(int fd, const char* buffer, int size) { - int offset = 0; - int bytes = 0; - while (offset < size) { - bytes = send(fd, buffer + offset, size - offset, 0); - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - // send failed - return -1; - } else { - bytes = 0; - } - } - offset += bytes; - } - return offset; -} - -static int SocketRecv(int fd, char* buffer, int size) { - int offset = 0; - int bytes = 0; - while (offset < size) { - bytes = recv(fd, buffer + offset, size - offset, 0); - if (bytes == 0) { - // closed by client, maybe probing alive client - return 0; - } - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - return -1; - } else { - bytes = 0; - } - } - offset += bytes; - } - return offset; -} - -static void BindOrConnectFailed(int timeout, - int* try_times, - int* total_time, - const char* op, - const std::string& ep) { - PADDLE_ENFORCE_LT( - *total_time, - timeout, - platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", - op, - ep.c_str(), - strerror(errno))); - ++(*try_times); - int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds - *total_time += retry_time; - - LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times - << " times with reason: " << strerror(errno) << " retry after " - << retry_time / 1000.0 << " seconds"; - std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); -} - -int CreateListenSocket(const std::string& ep) { - auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ( - addr.size(), - 2UL, - platform::errors::InvalidArgument( - "The endpoint should contain host and port, but got %s.", ep)); - std::string host = addr[0]; - int port = std::stoi(addr[1]); - - // creating socket fd - int server_fd = -1; - CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); - - // NOTE. Solutions to `Address already in use`. - // 1. Reuse addr&port. Otherwise, once the server closes the socket - // before client, the server will enter TIME-WAIT status. If we bind port - // again, the error `Address already in use` will appear. - // 2. Or we can close the client first to ensure that the server does - // not enter the TIME-WAIT state. 
But this is obviously not as convenient - // as the reuse method. - int opt = 1; -#if defined(SO_REUSEPORT) - // since Linux kernel 3.9 - CHECK_SYS_CALL(setsockopt(server_fd, - SOL_SOCKET, - SO_REUSEADDR | SO_REUSEPORT, - &opt, - sizeof(opt)), - "setsockopt"); -#else - CHECK_SYS_CALL( - setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), - "setsockopt"); -#endif - - struct sockaddr_in address; - address.sin_family = AF_INET; - address.sin_addr.s_addr = INADDR_ANY; - address.sin_port = htons(port); - - // TODO(wangxi) Set from env, default 900s=15min - int timeout = 900 * 1000; - int try_times = 0; - int total_time = 0; - while (true) { - int ret_val = -1; - RETRY_SYS_CALL_VAL( - bind(server_fd, (struct sockaddr*)&address, sizeof(address)), - "bind", - ret_val); - - if (ret_val == -1) { - BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); - continue; - } - break; - } - - CHECK_SYS_CALL(listen(server_fd, 3), "listen"); - LOG(INFO) << "Server listening on: " << ep << " successful."; - return server_fd; -} - -void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } - -static int SocketAccept(int server_fd, const char* head) { - struct sockaddr_in client_addr; - socklen_t addr_length = sizeof(client_addr); - char buffer[1024] = {0}; - int conn = -1; - - while (true) { - CHECK_SYS_CALL_VAL(accept(server_fd, - reinterpret_cast(&client_addr), - &addr_length), - "accept", - conn); - - int ret_val = SocketRecv(conn, buffer, strlen(head)); - if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { - break; // accept client - } else { - VLOG(3) << "socket read failed with ret_val=" << ret_val; - CloseSocket(conn); - } - } - return conn; -} - -static int ConnectAddr(const std::string& ep, const char* head) { - auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ( - addr.size(), - 2UL, - platform::errors::InvalidArgument( - "The endpoint should contain host and port, but got %s.", ep)); - std::string host = addr[0]; - int port = std::stoi(addr[1]); - - int sock = -1; - CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); - - struct sockaddr_in server_addr; - memset(&server_addr, 0, sizeof(server_addr)); - server_addr.sin_family = AF_INET; - server_addr.sin_port = htons(port); - - char* ip = NULL; - struct hostent* hp = NULL; - // sleep for get_host_by_name_time seconds. 
- for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) { - hp = gethostbyname(host.c_str()); - if (hp != NULL) { - break; - } - std::this_thread::sleep_for(std::chrono::seconds(2)); - LOG(WARNING) << "gethostbyname " << host.c_str() << " error!"; - } - PADDLE_ENFORCE_NOT_NULL( - hp, - platform::errors::InvalidArgument("Fail to get host by name %s.", host)); - - int i = 0; - while (hp->h_addr_list[i] != NULL) { - ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; - break; - } - - PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), - 0, - platform::errors::Unavailable( - "Open address %s failed: %s", ep, strerror(errno))); - - // TODO(wangxi) Set from env, default 900s=15min - int timeout = 900 * 1000; - int try_times = 0; - int total_time = 0; - while (true) { - int ret_val = -1; - RETRY_SYS_CALL_VAL( - connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), - "connect", - ret_val); - - if (ret_val == -1) { - BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); - continue; - } - - CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); - break; - } - return sock; -} - -static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { - char buffer[1024] = {0}; - static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, - "hccl id bytes must <= buffer size"); - - CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), - "recv hccl id"); - memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); -} - -static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { - char buffer[1024] = {0}; - memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); - - CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), - "send hccl id"); -} - -void SendBroadCastHCCLID(std::vector servers, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - // connect with server - std::vector connects; - for (auto server : servers) { - VLOG(3) << "connecting endpoint: " << server; - int conn = ConnectAddr(server, COMM_HEAD); - connects.push_back(conn); - } - VLOG(3) << "connecting completed..."; - - for (int i = 0; i < hccl_comm_num; ++i) { - std::string var_name = func(i); - auto var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); - auto hccl_id = var->GetMutable(); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); - - int j = 0; - for (auto conn : connects) { - VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] - << " hccl_comm_no: " << i; - SendHCCLID(conn, hccl_id); - ++j; - } - VLOG(3) << "sending completed..."; - } - - // close client - for (auto conn : connects) { - CloseSocket(conn); - } -} - -void RecvBroadCastHCCLID(std::string endpoint, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - int server = CreateListenSocket(endpoint); - RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); - CloseSocket(server); -} - -void RecvBroadCastHCCLID(int server_fd, - std::string endpoint, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - int client = SocketAccept(server_fd, COMM_HEAD); - - for (int i = 0; i < hccl_comm_num; ++i) { - std::string var_name = func(i); - auto var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); - auto hccl_id = var->GetMutable(); - - VLOG(3) << "trainer: " << endpoint << 
" receiving hccl_id_var: " << var_name - << " from trainer 0, hccl_comm_no: " << i; - RecvHCCLID(client, hccl_id); - } - VLOG(3) << "receiving completed..."; - CloseSocket(client); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h deleted file mode 100644 index a64a44f9f6166..0000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { - -int CreateListenSocket(const std::string& ep); - -void CloseSocket(int fd); - -void SendBroadCastHCCLID(std::vector servers, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); - -// server listen on endpoint, then recv nccl id -void RecvBroadCastHCCLID(std::string endpoint, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); - -// recv nccl id from socket -void RecvBroadCastHCCLID(int server_fd, - std::string endpoint, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 30fdb90ce1069..4c7578c010473 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -22,6 +22,13 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif + +PADDLE_DEFINE_EXPORTED_bool( + cache_inference_while_scope, + false, + "Cache the scope of the while op to avoid repeated creation of the scope " + "for each iteration and improve inference performance."); + namespace paddle { namespace framework { class InferShapeContext; @@ -257,14 +264,23 @@ class WhileOp : public framework::OperatorBase { scope.FindVar(Input(kCondition))->Get()); } } else { - auto ¤t_scope = scope.NewScope(); - - BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); - core_->reset_scope(¤t_scope); + framework::Scope *current_scope = nullptr; + if (!FLAGS_cache_inference_while_scope) { + current_scope = &(scope.NewScope()); + BuildScopeForControlFlowOp(*core_, *block, current_scope); + core_->reset_scope(current_scope); + } else { + if (cached_inference_scope_ == nullptr) { + cached_inference_scope_ = &(scope.NewScope()); + BuildScopeForControlFlowOp(*core_, *block, cached_inference_scope_); + core_->reset_scope(cached_inference_scope_); + } + current_scope = cached_inference_scope_; + } while (cond_data) { - for (auto &name : current_scope.LocalVarNames()) { - auto *var = current_scope.Var(name); + for (auto &name : current_scope->LocalVarNames()) { + auto *var = current_scope->Var(name); if (var->IsType()) { // Clear all 
lod information for all lod_tensors. auto *t = var->GetMutable(); @@ -283,7 +299,9 @@ class WhileOp : public framework::OperatorBase { scope.FindVar(Input(kCondition))->Get()); } - scope.DeleteScope(¤t_scope); + if (!FLAGS_cache_inference_while_scope) { + scope.DeleteScope(current_scope); + } } } @@ -291,6 +309,7 @@ class WhileOp : public framework::OperatorBase { mutable std::shared_ptr executor_{nullptr}; mutable std::unique_ptr ctx_{nullptr}; mutable std::shared_ptr core_{nullptr}; + mutable framework::Scope *cached_inference_scope_{nullptr}; }; class WhileOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index a4b0e637e12dc..ce2cbb43deed0 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -158,5 +158,23 @@ phi::KernelKey GetMatrixNmsExpectedKernelType( platform::CPUPlace()); } +phi::KernelKey GetUniqueExpectedKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel* op_ptr) { + (void)ctx; + // Return CPUPlace when Attr("is_sorted") is false. Because it means + // that fluid.layers.unique is called, but there is no cuda kernel. + if (!ctx.Attr("is_sorted")) { + return phi::KernelKey( + op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"), + platform::CPUPlace()); + } else { + // new version paddle.unique is called. + return phi::KernelKey( + op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace()); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.h b/paddle/fluid/operators/generator/get_expected_kernel_func.h index a83f5865e3499..cbbb74e2312ed 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.h +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.h @@ -44,5 +44,9 @@ phi::KernelKey GetMatrixNmsExpectedKernelType( const framework::ExecutionContext& ctx, const framework::OperatorWithKernel* op_ptr); +phi::KernelKey GetUniqueExpectedKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel* op_ptr); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/generator/type_mapping.py b/paddle/fluid/operators/generator/type_mapping.py index 8aec1bcc49a5e..e6b59b7823abe 100644 --- a/paddle/fluid/operators/generator/type_mapping.py +++ b/paddle/fluid/operators/generator/type_mapping.py @@ -76,7 +76,7 @@ 'int64_t[]': 'std::vector', 'float[]': 'std::vector', 'double[]': 'std::vector', - 'str[]': 'std::vector<', + 'str[]': 'std::vector', } output_type_map = {'Tensor': 'Tensor', 'Tensor[]': 'std::vector'} diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index e1de4a9a4d312..e73d2a2b5ce39 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -124,7 +124,6 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); // for parameter prefetch - AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); AddAttr>("height_sections", "Height for each output SelectedRows.") diff --git a/paddle/fluid/operators/isfinite_op.cu 
b/paddle/fluid/operators/isfinite_op.cu old mode 100644 new mode 100755 index d8e18f58fa9f2..80a65cbda916b --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -22,18 +23,21 @@ REGISTER_OP_CUDA_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( isnan, ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( isfinite, ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 6975873b13796..42cb92db8625e 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -39,10 +39,6 @@ cc_test( vol2col_test SRCS vol2col_test.cc DEPS vol2col) -cc_test( - sequence_pooling_test - SRCS sequence_pooling_test.cc - DEPS sequence_pooling) cc_test( beam_search_test SRCS beam_search_test.cc diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 2df8ff971cef1..cb2c374d017fd 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -39,6 +39,7 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); AddInput("AvgSquaredUpdate", "(Tensor) Input average of squared parameter updates"); + AddInput("LearningRate", "(Tensor) Learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc deleted file mode 100644 index 17d31e35fdec2..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class MergedMomentumOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto param_dtype = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return phi::KernelKey(param_dtype, ctx.GetPlace()); - } -}; - -class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", - "(Tensor, default Tensor) " - "Input parameter that has to be updated") - .AsDuplicable(); - AddInput("Grad", - "(Tensor, default Tensor) " - "Input gradient of the parameter") - .AsDuplicable(); - AddInput("Velocity", - "(Tensor, default Tensor) " - "Input velocity (corresponding to the parameter) " - "that has to be updated") - .AsDuplicable(); - AddInput("LearningRate", - "(Tensor, default Tensor) " - "Input learning rate") - .AsDuplicable(); - AddInput("MasterParam", "FP32 master weight for AMP.") - .AsDispensable() - .AsDuplicable(); - AddOutput("ParamOut", - "(Tensor) This output is updated parameter. " - "It shared memory with Input(Param).") - .AsDuplicable(); - AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " - "It shared memory with Input(Velocity).") - .AsDuplicable(); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .AsDispensable() - .AsDuplicable(); - AddAttr("mu", "(float) Momentum coefficient"); - AddAttr("use_nesterov", - "(bool, default false) " - "Use Nesterov Momentum or not.") - .SetDefault(false); - AddAttr>( - "regularization_method", - "(string) regularization_method, right now only " - "support l2decay or none") - .SetDefault({}); - AddAttr>("regularization_coeff", - "(float) regularization_coeff") - .SetDefault({}); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - AddAttr( - "rescale_grad", - "(float, default 1.0) Multiply the gradient with `rescale_grad`" - "before updating. Often choose to be `1.0/batch_size`.") - .SetDefault(1.0f); - AddComment(R"DOC(Merged Momentum Optimizer.)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -DECLARE_INFER_SHAPE_FUNCTOR(merged_momentum, - MergedMomentumInferShapeFunctor, - PD_INFER_META(phi::MergedMomentumInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, - ops::MergedMomentumOp, - ops::MergedMomentumOpMaker, - MergedMomentumInferShapeFunctor); diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc deleted file mode 100644 index 187b227f33170..0000000000000 --- a/paddle/fluid/operators/randperm_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/randperm_op.h" - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -class RandpermOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "The output(Out) of randperm op must not be null.")); - int n = ctx->Attrs().Get("n"); - PADDLE_ENFORCE_GT( - n, - 0, - platform::errors::InvalidArgument( - "The input 'n' of randperm op should be greater than 0. " - "But received %d.", - n)); - - ctx->SetOutputDim("Out", phi::make_ddim({n})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class RandpermOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "The output tensor of randperm op."); - - AddAttr( - "n", "The upper bound (exclusive), and it should be greater than 0."); - AddAttr("dtype", - "The data type of output tensor. " - "Default: 3[int64].") - .SetDefault(framework::proto::VarType::INT64); - AddAttr("seed", - "Random seed used for permute samples. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random permutation every time. " - "Default: 0.") - .SetDefault(0); - - AddComment(R"DOC( -This operator returns a random permutation of integers from 0 to n-1. -)DOC"); - } -}; - -class RandpermOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto var_data_type = static_cast( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - ctx->SetOutputDataType("Out", var_data_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - randperm, - paddle::operators::RandpermOp, - paddle::operators::RandpermOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::RandpermOpVarTypeInference); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc deleted file mode 100644 index 2e97f5b9b0dc2..0000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SquaredL2NormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class SquaredL2NormGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("squared_l2_norm_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("X", this->Input("X")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SquaredL2NormGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The input of squared_l2_norm op."); - AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); - AddComment(R"DOC( -SquaredL2Norm Operator. - -Computes the squared L2 norm of a tensor. - -$$Out = \sum_{i} X_{i}^2$$ - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(squared_l2_norm, - SquaredL2NormInferShapeFunctor, - PD_INFER_META(phi::SquaredL2NormInferMeta)); - -DECLARE_INFER_SHAPE_FUNCTOR(squared_l2_norm_grad, - SquaredL2NormGradInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); - -REGISTER_OPERATOR(squared_l2_norm, - ops::SquaredL2NormOp, - ops::SquaredL2NormOpMaker, - ops::SquaredL2NormGradOpMaker, - ops::SquaredL2NormGradOpMaker, - SquaredL2NormInferShapeFunctor); - -REGISTER_OPERATOR(squared_l2_norm_grad, - ops::SquaredL2NormGradOp, - SquaredL2NormGradInferShapeFunctor); diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc deleted file mode 100644 index 5484a16ca6bd4..0000000000000 --- a/paddle/fluid/operators/unique_op.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unique_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class UniqueOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - - bool return_index = ctx->Attrs().Get("return_index"); - bool return_inverse = ctx->Attrs().Get("return_inverse"); - bool return_counts = ctx->Attrs().Get("return_counts"); - auto axis_vec = ctx->Attrs().Get>("axis"); - auto data_type = - static_cast(static_cast( - ctx->Attrs().Get("dtype"))); - - // Construct MetaTensor for InferMeta Func - using CompatMetaTensor = framework::CompatMetaTensor; - CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); - CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); - std::unique_ptr indices(nullptr); - std::unique_ptr index(nullptr); - std::unique_ptr counts(nullptr); - - if (return_index) { - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); - indices = - std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime()))); - } - if (return_inverse) { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - index = std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); - } - if (return_counts) { - OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); - counts = std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime()))); - } - bool is_sorted = ctx->Attrs().Get("is_sorted"); - if (is_sorted) { - phi::UniqueInferMeta(x, - return_index, - return_inverse, - return_counts, - axis_vec, - data_type, - &out, - indices.get(), - index.get(), - counts.get()); - } else { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - if (index == nullptr) { - index = - std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); - } - phi::UniqueRawInferMeta(x, - return_index, - return_inverse, - return_counts, - axis_vec, - data_type, - is_sorted, - &out, - indices.get(), - index.get(), - counts.get()); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // Return CPUPlace when Attr("is_sorted") is false. Because it means - // that fluid.layers.unique is called, but there is no cuda kernel. - if (!ctx.Attr("is_sorted")) { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } else { - // new version paddle.unique is called. - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } - } -}; - -class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Input tensor. 
It should be a 1-D tensor when Attr(is_sorted)" - " is false or a N-D tensor when Attr(is_sorted) is true."); - AddAttr("dtype", "data type for output index"); - AddOutput("Out", "A unique subsequence for input tensor."); - AddOutput("Index", - "Equivalent to inverse in numpy.unique, " - "the indices for where elements in the original input ended up " - "in the returned unique tensor."); - AddOutput( - "Indices", - "The indices of the input tensor that result in the unique tensor.") - .AsDispensable(); - AddOutput("Counts", "The counts for each unique element.").AsDispensable(); - AddAttr("return_index", - "If True, also return the indices of the input" - " tensor that result in the unique Tensor.") - .SetDefault(false); - AddAttr( - "return_inverse", - "If True, also return the indices for where elements" - " in the original input ended up in the returned unique tensor.") - .SetDefault(false); - AddAttr("return_counts", - "If True, also return the counts for each unique element.") - .SetDefault(false); - AddAttr>( - "axis", - "The axis to apply unique. If None, the input will be flattened.") - .SetDefault({}); - AddAttr("is_sorted", - "If True, the unique elements of X are in ascending order." - "Otherwise, the unique elements are not sorted.") - .SetDefault(false); - AddComment(R"DOC( - 1. Return a unique subsequence for 1-D input tensor, and an index tensor - pointing to this unique subsequence when Attr(is_sorted) is false. This - means paddle.unique is called. - - 2. Returns the unique elements of X in ascending order when Attr(is_sorted) - is true. This means fluid.layers.unique is called. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 7ca431e8ea5d1..af90cbdfc1639 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -17,7 +17,6 @@ register_unity_group( assert_op.cc assign_value_op.cc attention_lstm_op.cc - average_accumulates_op.cc batch_fc_op.cc bce_loss_op.cc beam_search_op.cc @@ -222,7 +221,6 @@ register_unity_group( mkldnn/quantize_mkldnn_op.cc queue_generator_op.cc random_crop_op.cc - randperm_op.cc range_op.cc rank_attention_op.cc rank_loss_op.cc @@ -386,7 +384,6 @@ register_unity_group( conv_transpose_op.cu cos_sim_op.cu crop_op.cu - average_accumulates_op.cu conj_op.cu correlation_op.cu) register_unity_group( @@ -500,7 +497,6 @@ register_unity_group( register_unity_group( cu random_crop_op.cu - randperm_op.cu range_op.cu reverse_op.cu partial_concat_op.cu diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 286d3cae8de5d..c0830b2a75428 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1483,9 +1483,10 @@ void batch_norm_grad(const Tensor& x, if (bias_grad) { set_output(out_grad_data_sum, bias_grad); } - break; } + break; } + default: PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", data_layout)); diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index 83b18814b19d4..b1b24af231f68 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ 
b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -575,6 +575,9 @@ class CompositeGradOpMakerBase { const std::unordered_map& RuntimeAttrs() const { + LOG(WARNING) << "CompositeGradOpMaker doesn't support runtime attrs, " "but found that op " << fwd_op_.Type() << " uses a runtime attr."; return fwd_op_.GetRuntimeAttrMap(); } diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 2eb7934c911c5..03b8690569c22 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -220,7 +220,12 @@ std::map> op_ins_map = { {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}}, {"adadelta", - {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}}, + {"Param", + "Grad", + "AvgSquaredGrad", + "AvgSquaredUpdate", + "LearningRate", + "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, {"nce", {"Input", diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f2d1c396617b1..93030c9138fa8 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -770,7 +770,11 @@ PyObject* ToPyObject(const std::vector>& value) { PyObject* ToPyObject(const std::vector& value, bool return_py_none_if_not_initialize) { + // NOTE(liuyuanle): I encountered a bug (access violation) on Windows; see + // https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin + PyGILState_STATE gstate = PyGILState_Ensure(); PyObject* result = PyList_New((Py_ssize_t)value.size()); + PyGILState_Release(gstate); for (size_t i = 0; i < value.size(); i++) { if (!value[i].initialized() && return_py_none_if_not_initialize) { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 44966f930d3f1..0546dd84b6882 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -65,7 +65,7 @@ constexpr int NPY_UINT16_ = 4; // paddle::platform::float16 as numpy.float16.
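// A minimal sketch (not Paddle code) of the GIL pattern behind the ToPyObject
// fix in the eager_utils.cc hunk above: CPython calls such as PyList_New may
// only be made while the GIL is held, so the patch brackets the allocation
// with PyGILState_Ensure()/PyGILState_Release(). The helper name below is
// hypothetical and only illustrates the pairing.
#include <Python.h>

PyObject* MakeListHoldingGil(Py_ssize_t size) {
  PyGILState_STATE gstate = PyGILState_Ensure();  // acquire the GIL
  PyObject* list = PyList_New(size);              // safe: GIL is held
  PyGILState_Release(gstate);                     // release it promptly
  return list;
}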
// Ref: https://github.com/pybind/pybind11/issues/1776 template <> -struct npy_format_descriptor { +struct npy_format_descriptor { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); return reinterpret_borrow(ptr); @@ -180,7 +180,7 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { dt = py::dtype::of(); break; case PaddleDType::FLOAT16: - dt = py::dtype::of(); + dt = py::dtype::of(); break; case PaddleDType::UINT8: dt = py::dtype::of(); @@ -264,7 +264,7 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::FLOAT16) { tensor.ShareExternalData( - static_cast(input_tensor.data()), + static_cast(input_tensor.data()), shape, ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::INT32) { @@ -353,7 +353,7 @@ size_t PaddleGetDTypeSize(PaddleDType dt) { size = sizeof(float); break; case PaddleDType::FLOAT16: - size = sizeof(paddle_infer::float16); + size = sizeof(phi::dtype::float16); break; case PaddleDType::INT8: size = sizeof(int8_t); @@ -392,8 +392,8 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT tensor.copy_to_cpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.copy_to_cpu( - static_cast(array.mutable_data())); + tensor.copy_to_cpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.copy_to_cpu(static_cast(array.mutable_data())); @@ -432,8 +432,8 @@ py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT tensor.CopyToCpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.CopyToCpu( - static_cast(array.mutable_data())); + tensor.CopyToCpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.CopyToCpu(static_cast(array.mutable_data())); @@ -1062,6 +1062,16 @@ void BindPaddleInferPredictor(py::module *m) { .def("get_output_names", &paddle_infer::Predictor::GetOutputNames) .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) + .def( + "run", + [](paddle_infer::Predictor &self, py::handle py_in_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + std::vector outputs; + self.Run(in_tensor_list, &outputs); + return py::handle(ToPyObject(outputs)); + }, + py::arg("inputs")) .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) @@ -1091,9 +1101,9 @@ void BindZeroCopyTensor(py::module *m) { .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyTensorCreate) // NOTE(liuyuanle): double must be bound after float. .def("copy_from_cpu", &ZeroCopyTensorCreate) - .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) @@ -1116,10 +1126,9 @@ void BindPaddleInferTensor(py::module *m) { .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) // NOTE(liuyuanle): double must be bound after float. 
.def("_copy_from_cpu_bind", &PaddleInferTensorCreate) - .def("_copy_from_cpu_bind", - &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("_share_external_data_bind", &PaddleInferShareExternalData) diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index d1d336b5bb009..aec21c6b0f629 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -373,7 +373,16 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 53 support float16 + return platform::GetGPUComputeCapability(place.device) >= 53; + }); + m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 80 support bfloat16 + return platform::GetGPUComputeCapability(place.device) >= 80; + }); +#endif py::class_ xpuplace(m, "XPUPlace", R"DOC( **Note**: Examples: @@ -492,7 +501,18 @@ void BindPlace(pybind11::module &m) { // NOLINT &IsSamePlace) .def("__repr__", string::to_string) .def("__str__", string::to_string); - + m.def("is_float16_supported", + [](const platform::CPUPlace &place) -> bool { return false; }); + m.def("is_bfloat16_supported", [](const platform::CPUPlace &place) -> bool { +#ifndef PADDLE_WITH_MKLDNN + return false; +#else + if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) + return true; + else + return false; +#endif + }); py::class_ cudapinnedplace( m, "CUDAPinnedPlace", R"DOC( CUDAPinnedPlace is a descriptor of a device. diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 9661d5524140b..5493cc945cf4c 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -425,7 +425,8 @@ void BindOpDesc(pybind11::module *m) { &pd::OpDesc::SetDistAttr, pybind11::return_value_policy::reference) .def("inputs", [](pd::OpDesc &self) { return self.Inputs(); }) - .def("outputs", &pd::OpDesc::Outputs); + .def("outputs", &pd::OpDesc::Outputs) + .def("get_attr_map", &pd::OpDesc::GetAttrMap); pybind11::class_ scalar(*m, "Scalar", ""); scalar.def(py::init()) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 65aa609e34fde..bde6357ccbe2f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1960,17 +1960,6 @@ All parameter, weight, gradient are variables in Paddle. 
py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { - // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetGPUComputeCapability(place.device) >= 53; - }); - m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { - // Only GPUs with Compute Capability >= 80 support bfloat16 - return platform::GetGPUComputeCapability(place.device) >= 80; - }); -#endif - m.def("set_feed_variable", static_cast> 2); -} - template <> struct hash { std::size_t operator()(const ir::NamedAttribute &obj) const { - return hash_combine(std::hash()(obj.name_), - std::hash()(obj.value_)); + return ir::hash_combine(std::hash()(obj.name_), + std::hash()(obj.value_)); } }; } // namespace std diff --git a/paddle/ir/builtin_attribute_storage.cc b/paddle/ir/builtin_attribute_storage.cc index 961319bc4a94e..c7feacae4d64a 100644 --- a/paddle/ir/builtin_attribute_storage.cc +++ b/paddle/ir/builtin_attribute_storage.cc @@ -14,6 +14,7 @@ #include "paddle/ir/builtin_attribute_storage.h" #include "paddle/ir/builtin_attribute.h" +#include "paddle/ir/utils.h" namespace ir { @@ -32,7 +33,7 @@ DictionaryAttributeStorage::DictionaryAttributeStorage(const ParamKey &key) { std::size_t DictionaryAttributeStorage::HashValue(const ParamKey &key) { std::size_t hash_value = key.size(); for (auto iter = key.begin(); iter != key.end(); ++iter) { - hash_value = hash_combine( + hash_value = ir::hash_combine( hash_value, std::hash()(NamedAttribute(iter->first, iter->second))); } diff --git a/paddle/ir/builtin_attribute_storage.h b/paddle/ir/builtin_attribute_storage.h index a0fdca9f1e10f..a34648fb17e35 100644 --- a/paddle/ir/builtin_attribute_storage.h +++ b/paddle/ir/builtin_attribute_storage.h @@ -83,10 +83,6 @@ struct DictionaryAttributeStorage : public AttributeStorage { uint32_t size() const { return size_; } private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } - NamedAttribute *data_; uint32_t size_; }; diff --git a/paddle/ir/builtin_type_storage.h b/paddle/ir/builtin_type_storage.h index 876b6ceeffdce..132a1656a7975 100644 --- a/paddle/ir/builtin_type_storage.h +++ b/paddle/ir/builtin_type_storage.h @@ -17,6 +17,7 @@ #include #include "paddle/ir/type.h" +#include "paddle/ir/utils.h" namespace std { /// @@ -109,20 +110,22 @@ struct DenseTensorTypeStorage : public ir::TypeStorage { std::size_t hash_value = 0; // hash dtype hash_value = - hash_combine(hash_value, std::hash()(std::get<0>(key))); + ir::hash_combine(hash_value, std::hash()(std::get<0>(key))); // hash dims - hash_value = hash_combine(hash_value, std::hash()(std::get<1>(key))); - // hash layout hash_value = - hash_combine(hash_value, - std::hash::type>()( - static_cast::type>( - std::get<2>(key)))); + ir::hash_combine(hash_value, std::hash()(std::get<1>(key))); + // hash layout + hash_value = ir::hash_combine( + hash_value, + std::hash::type>()( + static_cast::type>( + std::get<2>(key)))); // hash lod - hash_value = hash_combine(hash_value, std::hash()(std::get<3>(key))); + hash_value = + ir::hash_combine(hash_value, std::hash()(std::get<3>(key))); // hash offset hash_value = - hash_combine(hash_value, std::hash()(std::get<4>(key))); + ir::hash_combine(hash_value, std::hash()(std::get<4>(key))); return hash_value; } @@ -146,11 +149,6 @@ struct DenseTensorTypeStorage : public ir::TypeStorage { 
DataLayout layout_; LoD lod_; size_t offset_; - - private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } }; } // namespace ir diff --git a/paddle/ir/op_base.h b/paddle/ir/op_base.h new file mode 100644 index 0000000000000..38ff4002c6b2b --- /dev/null +++ b/paddle/ir/op_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/operation.h" + +namespace ir { +class OpBase { + public: + Operation *operation() { return operation_; } + + explicit operator bool() { return operation() != nullptr; } + + operator Operation *() const { return operation_; } + + Operation *operator->() const { return operation_; } + + protected: + explicit OpBase(Operation *operation) : operation_(operation) {} + + private: + Operation *operation_; +}; + +} // namespace ir diff --git a/paddle/ir/operation.cc b/paddle/ir/operation.cc new file mode 100644 index 0000000000000..e9d727f1b5fb3 --- /dev/null +++ b/paddle/ir/operation.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/ir/operation.h" +#include "paddle/ir/utils.h" + +namespace ir { +// Allocate the required memory based on the size and number of inputs, outputs, +// and operators, and construct it in the order of: OpOutlineResult, +// OpInlineResult, Operation, Operand. +Operation *Operation::create(const std::vector &inputs, + const std::vector &output_types, + ir::DictionaryAttribute attribute) { + // 1. Calculate the required memory size for OpResults + Operation + + // OpOperands. + uint32_t num_results = output_types.size(); + uint32_t num_operands = inputs.size(); + uint32_t max_inline_result_num = + detail::OpResultImpl::GetMaxInlineResultIndex() + 1; + size_t result_mem_size = + num_results > max_inline_result_num + ? sizeof(detail::OpOutlineResultImpl) * + (num_results - max_inline_result_num) + + sizeof(detail::OpInlineResultImpl) * max_inline_result_num + : sizeof(detail::OpInlineResultImpl) * num_results; + size_t operand_mem_size = sizeof(detail::OpOperandImpl) * num_operands; + size_t op_mem_size = sizeof(Operation); + size_t base_size = result_mem_size + op_mem_size + operand_mem_size; + // 2. Malloc memory. + char *base_ptr = reinterpret_cast(aligned_malloc(base_size, 8)); + // 3.1. Construct OpResults. 
+ for (size_t idx = num_results; idx > 0; idx--) { + if (idx > max_inline_result_num) { + new (base_ptr) + detail::OpOutlineResultImpl(output_types[idx - 1], idx - 1); + base_ptr += sizeof(detail::OpOutlineResultImpl); + } else { + new (base_ptr) detail::OpInlineResultImpl(output_types[idx - 1], idx - 1); + base_ptr += sizeof(detail::OpInlineResultImpl); + } + } + // 3.2. Construct Operation. + Operation *op = + new (base_ptr) Operation(num_results, num_operands, attribute); + base_ptr += sizeof(Operation); + // 3.3. Construct OpOperands. + if ((reinterpret_cast(base_ptr) & 0x7) != 0) { + throw("The address of OpOperandImpl must be divisible by 8."); + } + for (size_t idx = 0; idx < num_operands; idx++) { + new (base_ptr) detail::OpOperandImpl(inputs[idx].impl_, op); + base_ptr += sizeof(detail::OpOperandImpl); + } + VLOG(4) << "Construct an Operation: " << op->print(); + return op; +} + +// Call destructors for OpResults, Operation, and OpOperands in sequence, and +// finally free memory. +void Operation::destroy() { + // 1. Get aligned_ptr by result_num. + uint32_t max_inline_result_num = + detail::OpResultImpl::GetMaxInlineResultIndex() + 1; + size_t result_mem_size = + num_results_ > max_inline_result_num + ? sizeof(detail::OpOutlineResultImpl) * + (num_results_ - max_inline_result_num) + + sizeof(detail::OpInlineResultImpl) * max_inline_result_num + : sizeof(detail::OpInlineResultImpl) * num_results_; + char *aligned_ptr = reinterpret_cast(this) - result_mem_size; + // 2.1. Deconstruct OpResult. + char *base_ptr = aligned_ptr; + for (size_t idx = num_results_; idx > 0; idx--) { + if (!reinterpret_cast(base_ptr)->use_empty()) { + throw("Cannot destroy a value that still has uses!"); + } + if (idx > max_inline_result_num) { + reinterpret_cast(base_ptr) + ->~OpOutlineResultImpl(); + base_ptr += sizeof(detail::OpOutlineResultImpl); + } else { + reinterpret_cast(base_ptr) + ->~OpInlineResultImpl(); + base_ptr += sizeof(detail::OpInlineResultImpl); + } + } + // 2.2. Deconstruct Operation. + if (reinterpret_cast(base_ptr) != + reinterpret_cast(this)) { + throw("Operation address error"); + } + reinterpret_cast(base_ptr)->~Operation(); + base_ptr += sizeof(Operation); + // 2.3. Deconstruct OpOpOerand. + for (size_t idx = 0; idx < num_operands_; idx++) { + reinterpret_cast(base_ptr)->~OpOperandImpl(); + base_ptr += sizeof(detail::OpOperandImpl); + } + // 3. Free memory. 
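`Operation::create` above packs everything into one allocation laid out as results | Operation | operands (results are constructed in reverse index order so that result 0 sits right next to the Operation), and `destroy` walks the same layout before freeing it. A stripped-down sketch of the idea with toy types, ignoring the inline/outline result split, the alignment checks, and the explicit destructor calls of the real code:

```cpp
#include <cstddef>
#include <cstdlib>
#include <new>

struct Result  { int type;  };   // toy stand-in for OpResultImpl
struct Operand { int value; };   // toy stand-in for OpOperandImpl

struct Op {
  unsigned num_results, num_operands;

  static Op* Create(unsigned num_results, unsigned num_operands) {
    std::size_t size = sizeof(Result) * num_results + sizeof(Op) +
                       sizeof(Operand) * num_operands;
    char* base = static_cast<char*>(std::malloc(size));
    char* p = base;
    for (unsigned i = 0; i < num_results; ++i, p += sizeof(Result))
      new (p) Result{0};                        // results live below the Op
    Op* op = new (p) Op{num_results, num_operands};
    p += sizeof(Op);
    for (unsigned i = 0; i < num_operands; ++i, p += sizeof(Operand))
      new (p) Operand{0};                       // operands live above the Op
    return op;                                  // note: not the malloc'd pointer
  }

  void Destroy() {
    // Recover the start of the allocation, then free the whole block.
    char* base = reinterpret_cast<char*>(this) - sizeof(Result) * num_results;
    std::free(base);
  }
};

// Usage: Op* op = Op::Create(/*results=*/2, /*operands=*/3); ...; op->Destroy();
```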
+ VLOG(4) << "Destroy an Operation: {ptr = " + << reinterpret_cast(aligned_ptr) + << ", size = " << result_mem_size << "}"; + aligned_free(reinterpret_cast(aligned_ptr)); +} + +Operation::Operation(uint32_t num_results, + uint32_t num_operands, + ir::DictionaryAttribute attribute) { + if (!attribute) { + throw("unexpected null attribute dictionary"); + } + num_results_ = num_results; + num_operands_ = num_operands; + attribute_ = attribute; +} + +ir::OpResult Operation::GetResultByIndex(uint32_t index) { + if (index >= num_results_) { + throw("index exceeds OP output range."); + } + uint32_t max_inline_idx = detail::OpResultImpl::GetMaxInlineResultIndex(); + char *ptr = nullptr; + if (index > max_inline_idx) { + ptr = reinterpret_cast(this) - + (max_inline_idx + 1) * sizeof(detail::OpInlineResultImpl) - + (index - max_inline_idx) * sizeof(detail::OpOutlineResultImpl); + } else { + ptr = reinterpret_cast(this) - + (index + 1) * sizeof(detail::OpInlineResultImpl); + } + if (index > max_inline_idx) { + detail::OpOutlineResultImpl *result_impl_ptr = + reinterpret_cast(ptr); + return ir::OpResult(result_impl_ptr); + } else { + detail::OpInlineResultImpl *result_impl_ptr = + reinterpret_cast(ptr); + return ir::OpResult(result_impl_ptr); + } +} + +std::string Operation::print() { + std::stringstream result; + result << "{ " << num_results_ << " outputs, " << num_operands_ + << " inputs } : "; + result << "[ "; + for (size_t idx = num_results_; idx > 0; idx--) { + result << GetResultByIndex(idx - 1).impl_ << ", "; + } + result << "] = "; + result << this << "( "; + for (size_t idx = 0; idx < num_operands_; idx++) { + result << reinterpret_cast(reinterpret_cast(this) + + sizeof(Operation) + + idx * sizeof(detail::OpOperandImpl)) + << ", "; + } + result << ")"; + return result.str(); +} + +} // namespace ir diff --git a/paddle/ir/operation.h b/paddle/ir/operation.h new file mode 100644 index 0000000000000..924dcafb73dfc --- /dev/null +++ b/paddle/ir/operation.h @@ -0,0 +1,57 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/builtin_attribute.h" +#include "paddle/ir/type.h" +#include "paddle/ir/value_impl.h" + +namespace ir { + +class alignas(8) Operation final { + public: + /// + /// \brief Malloc memory and construct objects in the following order: + /// OpResultImpls|Operation|OpOperandImpls. 
+ /// + static Operation *create(const std::vector &inputs, + const std::vector &output_types, + ir::DictionaryAttribute attribute); + + void destroy(); + + ir::OpResult GetResultByIndex(uint32_t index); + + std::string print(); + + ir::DictionaryAttribute attribute() { return attribute_; } + + uint32_t num_results() { return num_results_; } + + uint32_t num_operands() { return num_operands_; } + + private: + Operation(uint32_t num_results, + uint32_t num_operands, + ir::DictionaryAttribute attribute); + + ir::DictionaryAttribute attribute_; + + uint32_t num_results_ = 0; + + uint32_t num_operands_ = 0; +}; + +} // namespace ir diff --git a/paddle/ir/tests/CMakeLists.txt b/paddle/ir/tests/CMakeLists.txt index d94789fd05682..e012ec5bd264d 100644 --- a/paddle/ir/tests/CMakeLists.txt +++ b/paddle/ir/tests/CMakeLists.txt @@ -1,2 +1,3 @@ cc_test_old(type_test SRCS type_test.cc DEPS new_ir gtest) cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS new_ir gtest) +cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS new_ir gtest) diff --git a/paddle/ir/tests/ir_value_test.cc b/paddle/ir/tests/ir_value_test.cc new file mode 100644 index 0000000000000..c04e7c35128f4 --- /dev/null +++ b/paddle/ir/tests/ir_value_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/ir/attribute.h" +#include "paddle/ir/builtin_attribute.h" +#include "paddle/ir/builtin_type.h" +#include "paddle/ir/ir_context.h" +#include "paddle/ir/operation.h" + +// This unittest is used to test the construction interfaces of value class and +// operation. The constructed test scenario is: a = OP1(); b = OP2(); c = OP3(a, +// b); d, e, f, g, h, i, j = OP4(a, c); + +ir::DictionaryAttribute CreateAttribute(std::string attribute_name, + std::string attribute) { + ir::IrContext *ctx = ir::IrContext::Instance(); + ir::StrAttribute attr_name = ir::StrAttribute::get(ctx, attribute_name); + ir::Attribute attr_value = ir::StrAttribute::get(ctx, attribute); + std::map named_attr; + named_attr.insert( + std::pair(attr_name, attr_value)); + return ir::DictionaryAttribute::get(ctx, named_attr); +} + +TEST(value_test, value_test) { + ir::IrContext *ctx = ir::IrContext::Instance(); + // 1. Construct OP1: a = OP1() + std::vector op1_inputs = {}; + std::vector op1_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op1 = ir::Operation::create( + op1_inputs, op1_output_types, CreateAttribute("op1_name", "op1_attr")); + std::cout << op1->print() << std::endl; + // 2. Construct OP2: b = OP2(); + std::vector op2_inputs = {}; + std::vector op2_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op2 = ir::Operation::create( + op2_inputs, op2_output_types, CreateAttribute("op2_name", "op2_attr")); + std::cout << op2->print() << std::endl; + // 3. 
Construct OP3: c = OP3(a, b); + std::vector op3_inputs = {op1->GetResultByIndex(0), + op2->GetResultByIndex(0)}; + std::vector op3_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op3 = ir::Operation::create( + op3_inputs, op3_output_types, CreateAttribute("op3_name", "op3_attr")); + std::cout << op3->print() << std::endl; + // 4. Construct OP4: d, e, f, g, h, i, j = OP4(a, c); + std::vector op4_inputs = {op1->GetResultByIndex(0), + op3->GetResultByIndex(0)}; + std::vector op4_output_types; + for (size_t i = 0; i < 7; i++) { + op4_output_types.push_back(ir::Float32Type::get(ctx)); + } + ir::Operation *op4 = ir::Operation::create( + op4_inputs, op4_output_types, CreateAttribute("op4_name", "op4_attr")); + std::cout << op4->print() << std::endl; + + // Test 1: + EXPECT_EQ(op1->GetResultByIndex(0).GetDefiningOp(), op1); + EXPECT_EQ(op2->GetResultByIndex(0).GetDefiningOp(), op2); + EXPECT_EQ(op3->GetResultByIndex(0).GetDefiningOp(), op3); + EXPECT_EQ(op4->GetResultByIndex(6).GetDefiningOp(), op4); + + // Test 2: op1_first_output -> op4_first_input + ir::OpResult op1_first_output = op1->GetResultByIndex(0); + ir::detail::OpOperandImpl *op4_first_input = + reinterpret_cast( + reinterpret_cast(op4) + sizeof(ir::Operation)); + EXPECT_EQ(static_cast(op1_first_output).impl()->first_use(), + op4_first_input); + ir::detail::OpOperandImpl *op3_first_input = + reinterpret_cast( + reinterpret_cast(op3) + sizeof(ir::Operation)); + EXPECT_EQ(op4_first_input->next_use(), op3_first_input); + EXPECT_EQ(op3_first_input->next_use(), nullptr); + + // destroy + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op4->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op3->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op2->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op1->destroy(); +} diff --git a/paddle/ir/tests/type_test.cc b/paddle/ir/tests/type_test.cc index a11040e3656a5..d21afdcb80a59 100644 --- a/paddle/ir/tests/type_test.cc +++ b/paddle/ir/tests/type_test.cc @@ -21,6 +21,7 @@ #include "paddle/ir/ir_context.h" #include "paddle/ir/type.h" #include "paddle/ir/type_base.h" +#include "paddle/ir/utils.h" TEST(type_test, type_id) { // Define two empty classes, just for testing. @@ -172,8 +173,8 @@ struct IntegerTypeStorage : public ir::TypeStorage { using ParamKey = std::pair; static std::size_t HashValue(const ParamKey &key) { - return hash_combine(std::hash()(std::get<0>(key)), - std::hash()(std::get<1>(key))); + return ir::hash_combine(std::hash()(std::get<0>(key)), + std::hash()(std::get<1>(key))); } bool operator==(const ParamKey &key) const { @@ -188,11 +189,6 @@ struct IntegerTypeStorage : public ir::TypeStorage { unsigned width_ : 30; unsigned signedness_ : 2; - - private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } }; // Customize a parameterized type: IntegerType, storage type is diff --git a/paddle/ir/utils.cc b/paddle/ir/utils.cc new file mode 100644 index 0000000000000..9e6f1fcaf5790 --- /dev/null +++ b/paddle/ir/utils.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
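The Test 2 expectations above (op1's result lists op4's operand first, then op3's) come from `OpOperandImpl` pushing each new use onto the head of the value's use list, so the most recently created user is visited first. A tiny sketch of that prepend-at-head insertion with toy types:

```cpp
#include <cassert>

struct Use;

struct Val {                 // toy stand-in for ValueImpl
  Use* first_use = nullptr;
};

struct Use {                 // toy stand-in for OpOperandImpl
  Use* next_use = nullptr;
  explicit Use(Val& v) : next_use(v.first_use) { v.first_use = this; }
};

int main() {
  Val a;                     // "a = OP1()"
  Use op3_operand(a);        // OP3 uses a first ...
  Use op4_operand(a);        // ... then OP4 does, and becomes the new head
  assert(a.first_use == &op4_operand);
  assert(a.first_use->next_use == &op3_operand);
  assert(op3_operand.next_use == nullptr);
  return 0;
}
```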
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/ir/utils.h" + +namespace ir { +std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { + return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); +} + +void *aligned_malloc(size_t size, size_t alignment) { + assert(alignment >= sizeof(void *) && (alignment & (alignment - 1)) == 0); + size = (size + alignment - 1) / alignment * alignment; +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + void *aligned_mem = nullptr; + if (posix_memalign(&aligned_mem, alignment, size) != 0) { + aligned_mem = nullptr; + } + return aligned_mem; +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + void *mem = malloc(size + alignment); + if (mem == nullptr) { + return nullptr; + } + size_t adjust = alignment - reinterpret_cast(mem) % alignment; + void *aligned_mem = reinterpret_cast(mem) + adjust; + *(reinterpret_cast(aligned_mem) - 1) = mem; + assert(reinterpret_cast(aligned_mem) % alignment == 0); + return aligned_mem; +#endif +} + +void aligned_free(void *mem_ptr) { +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + free(mem_ptr); +#elif defined(_WIN32) + _aligned_free(mem_ptr); +#else + if (mem_ptr) { + free(*(reinterpret_cast(mem_ptr) - 1)); + } +#endif +} + +} // namespace ir diff --git a/paddle/phi/ops/compat/randperm_sig.cc b/paddle/ir/utils.h similarity index 60% rename from paddle/phi/ops/compat/randperm_sig.cc rename to paddle/ir/utils.h index 14b28512e402a..b4dd00281e159 100644 --- a/paddle/phi/ops/compat/randperm_sig.cc +++ b/paddle/ir/utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/compat/op_utils.h" +#pragma once -namespace phi { +#include +#include +#include -KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); -} +namespace ir { +std::size_t hash_combine(std::size_t lhs, std::size_t rhs); -} // namespace phi +void *aligned_malloc(size_t size, size_t alignment); -PD_REGISTER_ARG_MAPPING_FN(randperm, phi::RandpermOpArgumentMapping); +void aligned_free(void *mem_ptr); + +} // namespace ir diff --git a/paddle/ir/value.cc b/paddle/ir/value.cc new file mode 100644 index 0000000000000..f5ecc41018bcf --- /dev/null +++ b/paddle/ir/value.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
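A short usage sketch for the `aligned_malloc`/`aligned_free` pair defined above. The generic fallback over-allocates and stashes the original `malloc` pointer in the word just before the returned block, which is why such blocks must be released with `aligned_free` rather than plain `free`:

```cpp
#include <cassert>
#include <cstdint>

#include "paddle/ir/utils.h"

void AlignedAllocDemo() {
  void* block = ir::aligned_malloc(/*size=*/64, /*alignment=*/8);
  assert(block != nullptr);
  assert(reinterpret_cast<uintptr_t>(block) % 8 == 0);
  // ... placement-new objects into `block`, as Operation::create does ...
  ir::aligned_free(block);
}
```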
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/ir/value.h" +#include "paddle/ir/value_impl.h" + +namespace ir { +// Operand +OpOperand::OpOperand(const detail::OpOperandImpl *impl) + : impl_(const_cast(impl)) {} + +OpOperand &OpOperand::operator=(const OpOperand &rhs) { + if (this == &rhs) return *this; + impl_ = rhs.impl_; + return *this; +} + +OpOperand &OpOperand::operator=(const detail::OpOperandImpl *impl) { + if (this->impl_ == impl) return *this; + impl_ = const_cast(impl); + return *this; +} + +bool OpOperand::operator==(OpOperand other) const { + return impl_ == other.impl_; +} + +bool OpOperand::operator!=(OpOperand other) const { + return impl_ != other.impl_; +} + +bool OpOperand::operator!() const { return impl_ == nullptr; } + +OpOperand::operator bool() const { return impl_; } + +detail::OpOperandImpl *OpOperand::impl() const { return impl_; } + +// Value +Value::Value(const detail::ValueImpl *impl) + : impl_(const_cast(impl)) {} + +bool Value::operator==(const Value &other) const { + return impl_ == other.impl_; +} + +bool Value::operator!=(const Value &other) const { + return impl_ != other.impl_; +} + +bool Value::operator!() const { return impl_ == nullptr; } + +Value::operator bool() const { return impl_; } + +detail::ValueImpl *Value::impl() const { return impl_; } + +ir::Type Value::type() const { return impl_->type(); } + +void Value::SetType(ir::Type type) { impl_->SetType(type); } + +Operation *Value::GetDefiningOp() const { + if (auto result = dyn_cast()) return result.owner(); + return nullptr; +} + +std::string Value::print_ud_chain() { return impl_->print_ud_chain(); } + +// OpResult +bool OpResult::classof(Value value) { + return ir::isa(value.impl()); +} + +Operation *OpResult::owner() const { return impl()->owner(); } + +uint32_t OpResult::GetResultIndex() const { return impl()->GetResultIndex(); } + +detail::OpResultImpl *OpResult::impl() const { + return reinterpret_cast(impl_); +} + +uint32_t OpResult::GetValidInlineIndex(uint32_t index) { + uint32_t max_inline_index = + ir::detail::OpResultImpl::GetMaxInlineResultIndex(); + return index <= max_inline_index ? index : max_inline_index; +} + +// details +namespace detail { +ir::Operation *OpOperandImpl::owner() const { return owner_; } + +ir::detail::OpOperandImpl *OpOperandImpl::next_use() { return next_use_; } + +OpOperandImpl::OpOperandImpl(ir::Value source, ir::Operation *owner) + : source_(source), owner_(owner) { + prev_use_addr_ = source.impl()->first_use_addr(); + next_use_ = source.impl()->first_use(); + if (next_use_) { + next_use_->prev_use_addr_ = &next_use_; + } + source.impl()->SetFirstUse(this); +} + +void OpOperandImpl::remove_from_ud_chain() { + if (!prev_use_addr_) return; + if (prev_use_addr_ == source_.impl()->first_use_addr()) { + /// NOTE: In ValueImpl, first_use_offseted_by_index_ use lower three bits + /// storage index information, so need to be updated using the SetFirstUse + /// method here. 
+ source_.impl()->SetFirstUse(next_use_); + } else { + *prev_use_addr_ = next_use_; + } + if (next_use_) { + next_use_->prev_use_addr_ = prev_use_addr_; + } +} + +OpOperandImpl::~OpOperandImpl() { remove_from_ud_chain(); } + +uint32_t ValueImpl::index() const { + uint32_t index = + reinterpret_cast(first_use_offseted_by_index_) & 0x07; + if (index < 6) return index; + return reinterpret_cast(const_cast(this)) + ->GetResultIndex(); +} + +std::string ValueImpl::print_ud_chain() { + std::stringstream result; + result << "Value[" << this << "] -> "; + OpOperandImpl *tmp = first_use(); + if (tmp) { + result << "OpOperand[" << reinterpret_cast(tmp) << "] -> "; + while (tmp->next_use() != nullptr) { + result << "OpOperand[" << reinterpret_cast(tmp->next_use()) + << "] -> "; + tmp = tmp->next_use(); + } + } + result << "nullptr"; + return result.str(); +} + +uint32_t OpResultImpl::GetResultIndex() const { + if (const auto *outline_result = ir::dyn_cast(this)) { + return outline_result->GetResultIndex(); + } + return ir::dyn_cast(this)->GetResultIndex(); +} + +ir::Operation *OpResultImpl::owner() const { + // For inline result, pointer offset index to obtain the address of op. + if (const auto *result = ir::dyn_cast(this)) { + result += result->GetResultIndex() + 1; + return reinterpret_cast( + const_cast(result)); + } + // For outline result, pointer offset outline_index to obtain the address of + // maximum inline result. + const OpOutlineResultImpl *outline_result = + (const OpOutlineResultImpl *)(this); + outline_result += + (outline_result->outline_index_ - GetMaxInlineResultIndex()); + // The offset of the maximum inline result distance op is + // GetMaxInlineResultIndex. + const auto *inline_result = + reinterpret_cast(outline_result); + inline_result += (GetMaxInlineResultIndex() + 1); + return reinterpret_cast( + const_cast(inline_result)); +} +} // namespace detail +} // namespace ir diff --git a/paddle/ir/value.h b/paddle/ir/value.h new file mode 100644 index 0000000000000..3d197182cd6e9 --- /dev/null +++ b/paddle/ir/value.h @@ -0,0 +1,137 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/cast_utils.h" +#include "paddle/ir/type.h" + +namespace ir { +class Operation; + +namespace detail { +class OpOperandImpl; +class ValueImpl; +class OpResultImpl; +} // namespace detail + +/// +/// \brief OpOperand class represents the operand of operation. This class only +/// provides interfaces, for specific implementation, see Impl class. 
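`remove_from_ud_chain` above unlinks an operand in O(1) because each node stores `prev_use_addr_`, the address of whatever pointer currently points at it (the value's head slot or the previous node's `next_use_`). A compact sketch of that unlink-by-pointer-to-pointer technique with toy types:

```cpp
#include <cassert>

struct Node {
  Node* next = nullptr;
  Node** prev_addr = nullptr;  // address of the pointer that points at us

  void InsertAtHead(Node*& head) {
    next = head;
    prev_addr = &head;
    if (next) next->prev_addr = &next;
    head = this;
  }

  void Unlink() {              // O(1): no traversal of the list is needed
    if (!prev_addr) return;
    *prev_addr = next;
    if (next) next->prev_addr = prev_addr;
    prev_addr = nullptr;
    next = nullptr;
  }
};

int main() {
  Node* head = nullptr;
  Node a, b;
  a.InsertAtHead(head);        // head -> a
  b.InsertAtHead(head);        // head -> b -> a
  b.Unlink();                  // head -> a again
  assert(head == &a && a.next == nullptr && a.prev_addr == &head);
  return 0;
}
```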
+/// +class OpOperand { + public: + OpOperand() = default; + + OpOperand(const OpOperand &other) = default; + + OpOperand(const detail::OpOperandImpl *impl); // NOLINT + + OpOperand &operator=(const OpOperand &rhs); + + OpOperand &operator=(const detail::OpOperandImpl *impl); + + bool operator==(OpOperand other) const; + + bool operator!=(OpOperand other) const; + + bool operator!() const; + + explicit operator bool() const; + + detail::OpOperandImpl *impl() const; + + private: + detail::OpOperandImpl *impl_{nullptr}; +}; + +/// +/// \brief Value class represents the SSA value in the IR system. This class +/// only provides interfaces, for specific implementation, see Impl class. +/// +class Value { + public: + Value() = default; + + Value(const detail::ValueImpl *impl); // NOLINT + + Value(const Value &other) = default; + + bool operator==(const Value &other) const; + + bool operator!=(const Value &other) const; + + bool operator!() const; + + explicit operator bool() const; + + template + bool isa() const { + return ir::isa(*this); + } + + template + U dyn_cast() const { + return ir::dyn_cast(*this); + } + + detail::ValueImpl *impl() const; + + ir::Type type() const; + + void SetType(ir::Type type); + + Operation *GetDefiningOp() const; + + std::string print_ud_chain(); + + friend struct std::hash; + + protected: + detail::ValueImpl *impl_{nullptr}; +}; + +/// +/// \brief OpResult class represents the value defined by a result of operation. +/// This class only provides interfaces, for specific implementation, see Impl +/// class. +/// +class OpResult : public Value { + public: + using Value::Value; + + static bool classof(Value value); + + Operation *owner() const; + + uint32_t GetResultIndex() const; + + friend Operation; + + private: + static uint32_t GetValidInlineIndex(uint32_t index); + + detail::OpResultImpl *impl() const; +}; + +} // namespace ir + +namespace std { +template <> +struct hash { + std::size_t operator()(const ir::Value &obj) const { + return std::hash()(obj.impl_); + } +}; +} // namespace std diff --git a/paddle/ir/value_impl.h b/paddle/ir/value_impl.h new file mode 100644 index 0000000000000..2fa236dddd833 --- /dev/null +++ b/paddle/ir/value_impl.h @@ -0,0 +1,196 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/value.h" + +namespace ir { +static const uint32_t OUTLINE_OP_RESULT_INDEX = 6; + +class Operation; + +namespace detail { +/// +/// \brief OpOperandImpl +/// +class OpOperandImpl { + public: + ir::Operation *owner() const; + + ir::detail::OpOperandImpl *next_use(); + + /// Remove this operand from the current use list. 
+ void remove_from_ud_chain(); + + ~OpOperandImpl(); + + friend ir::Operation; + + private: + OpOperandImpl(ir::Value source, ir::Operation *owner); + + ir::detail::OpOperandImpl *next_use_ = nullptr; + + ir::detail::OpOperandImpl **prev_use_addr_ = nullptr; + + ir::Value source_; + + ir::Operation *owner_ = nullptr; +}; + +/// +/// \brief ValueImpl is the base class of all drived Value classes such as +/// OpResultImpl. This class defines all the information and usage interface in +/// the IR Value. Each Value include three attributes: +/// (1) type: ir::Type; (2) UD-chain of value: OpOperandImpl*, first operand +/// address with offset of this value; (3) index: the position where the output +/// list of the parent operator. +/// +class alignas(8) ValueImpl { + public: + /// + /// \brief Interface functions of "type_" attribute. + /// + ir::Type type() const { return type_; } + + void SetType(ir::Type type) { type_ = type; } + + /// + /// \brief Interface functions of "first_use_offseted_by_index_" attribute. + /// + uint32_t index() const; + + OpOperandImpl *first_use() const { + return reinterpret_cast( + reinterpret_cast(first_use_offseted_by_index_) & (~0x07)); + } + + void SetFirstUse(OpOperandImpl *first_use) { + uint32_t offset = index(); + first_use_offseted_by_index_ = reinterpret_cast( + reinterpret_cast(first_use) + offset); + VLOG(4) << "The index of this value is " << offset + << ". Offset and set first use: " << first_use << " -> " + << first_use_offseted_by_index_ << "."; + } + + OpOperandImpl **first_use_addr() { return &first_use_offseted_by_index_; } + + bool use_empty() const { return first_use() == nullptr; } + + std::string print_ud_chain(); + + protected: + /// + /// \brief Only can be constructed by derived classes such as OpResultImpl. + /// + explicit ValueImpl(ir::Type type, uint32_t index) { + if (index > OUTLINE_OP_RESULT_INDEX) { + throw("The value of index must not exceed 6"); + } + type_ = type; + first_use_offseted_by_index_ = reinterpret_cast( + reinterpret_cast(nullptr) + index); + VLOG(4) << "Construct a ValueImpl whose's index is " << index + << ". The offset first_use address is: " + << first_use_offseted_by_index_; + } + + /// + /// \brief Attribute1: Type of value. + /// + ir::Type type_; + + /// + /// \brief Attribute2/3: Record the UD-chain of value and index. + /// NOTE: The members of the OpOperandImpl include four pointers, so this + /// class is 8-byte aligned, and the lower 3 bits of its address are 0, so the + /// index can be stored in these 3 bits, stipulate: + /// (1) index = 0~5: represent positions 0 to 5 inline + /// output(OpInlineResultImpl); (2) index = 6: represent the position >=6 + /// outline output(OpOutlineResultImpl); (3) index = 7 is reserved. + /// + OpOperandImpl *first_use_offseted_by_index_ = nullptr; +}; + +/// +/// \brief OpResultImpl is the implementation of an operation result. +/// +class alignas(8) OpResultImpl : public ValueImpl { + public: + using ValueImpl::ValueImpl; + + static bool classof(const ValueImpl &value) { return true; } + + /// + /// \brief Get the parent operation of this result.(op_ptr = value_ptr + + /// index) + /// + ir::Operation *owner() const; + + /// + /// \brief Get the result index of the operation result. + /// + uint32_t GetResultIndex() const; + + /// + /// \brief Get the maximum number of results that can be stored inline. 
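The NOTE above works because `OpOperandImpl` is 8-byte aligned, so the low three bits of the stored first-use pointer are always zero and can carry the result index instead (0-5 for inline results, 6 for outline results, 7 reserved). A minimal sketch of that tag-in-the-low-bits encoding (toy names; the patch uses pointer addition where this sketch uses bitwise OR):

```cpp
#include <cassert>
#include <cstdint>

struct alignas(8) Use {};  // stand-in for the 8-byte aligned OpOperandImpl

inline Use* PackFirstUse(Use* use, uint32_t index) {
  assert(index <= 6 && (reinterpret_cast<uintptr_t>(use) & 0x07) == 0);
  return reinterpret_cast<Use*>(reinterpret_cast<uintptr_t>(use) | index);
}

inline Use* UnpackFirstUse(Use* tagged) {  // mask the tag away
  return reinterpret_cast<Use*>(reinterpret_cast<uintptr_t>(tagged) &
                                ~static_cast<uintptr_t>(0x07));
}

inline uint32_t UnpackIndex(Use* tagged) {  // read the tag back
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(tagged) & 0x07);
}
```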
+ /// + static uint32_t GetMaxInlineResultIndex() { + return OUTLINE_OP_RESULT_INDEX - 1; + } +}; + +/// +/// \brief OpInlineResultImpl is the implementation of an operation result whose +/// index <= 5. +/// +class OpInlineResultImpl : public OpResultImpl { + public: + OpInlineResultImpl(ir::Type type, uint32_t result_index) + : OpResultImpl(type, result_index) { + if (result_index > GetMaxInlineResultIndex()) { + throw("Inline result index should not exceed MaxInlineResultIndex(5)"); + } + } + + static bool classof(const OpResultImpl &value) { + return value.index() < OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return index(); } +}; + +/// +/// \brief OpOutlineResultImpl is the implementation of an operation result +/// whose index > 5. +/// +class OpOutlineResultImpl : public OpResultImpl { + public: + OpOutlineResultImpl(ir::Type type, uint32_t outline_index) + : OpResultImpl(type, OUTLINE_OP_RESULT_INDEX), + outline_index_(outline_index) {} + + static bool classof(const OpResultImpl &value) { + return value.index() >= OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return outline_index_; } + + uint32_t outline_index_; +}; + +} // namespace detail +} // namespace ir diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index d3943750fd21e..24bcc63dbd278 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -416,7 +416,7 @@ class PADDLE_API Tensor final { /** * @brief Return the name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @return const std::string& */ @@ -425,7 +425,7 @@ class PADDLE_API Tensor final { /** * @brief Set name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @param const std::string& name */ @@ -657,7 +657,7 @@ class PADDLE_API Tensor final { /** * Tensor name: used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. 
*/ std::string name_{""}; diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 3cebef1588ea5..6a409b6419623 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -136,6 +136,7 @@ Tensor add_n_impl(const std::vector& x) { Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); + out.set_name(x.name()); return out; } diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 7bf3b5cd2fcd8..d288f0bf18f6a 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1718,6 +1718,16 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : squared_l2_norm_grad + forward : squared_l2_norm(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : squared_l2_norm_grad + - backward_op : squeeze_double_grad forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray axis) diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index c9fae2a81e3b7..b43d02fced54c 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -5,14 +5,14 @@ # otherwise the operator only could be used in static mode. - op : conv2d_xpu - args : (Tensor input, Tensor input_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) - output : Tensor(output), Tensor(output_max) + args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) + output : Tensor(out), Tensor(out_max) infer_meta : func : Conv2dXPUInferMeta kernel : func : conv2d_xpu - data_type : input - optional : bias, branch, input_max + data_type : x + optional : bias, branch, x_max - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4ba99b1b81312..3a67b3e4a3e46 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -475,8 +475,8 @@ func : heaviside_grad - backward_op : hsigmoid_loss_grad - forward : hsigmoid_loss (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) - args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, bool is_sparse) + forward : hsigmoid_loss (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) + args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool is_sparse) output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -962,16 +962,6 @@ invoke : concat( out_grad, axis) composite : split_grad(out_grad, axis, x_grad) -- 
backward_op : squared_l2_norm_grad - forward : squared_l2_norm(Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : squared_l2_norm_grad - - backward_op : strided_slice_grad forward : strided_slice (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] axes, IntArray starts, IntArray ends, IntArray strides) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 53ae099e762ea..abd42601a8fa0 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -11,7 +11,7 @@ backward : abs_grad - op : adadelta_ - args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor master_param, float rho, float epsilon, bool multi_precision) + args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision) output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out) infer_meta : func : AdadeltaInferMeta @@ -153,16 +153,6 @@ data_type : dtype backend : place > output -- op : average_accumulates_ - args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window) - output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) - infer_meta: - func : AverageAccumulatesInferMeta - kernel : - func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} - data_type : param - inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) - - op : batch_norm args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -232,14 +222,6 @@ kernel : func : class_center_sample -- op : clip_by_norm - args : (Tensor x, float max_norm) - output : Tensor(out) - infer_meta : - func : ClipByNormInferMeta - kernel : - func : clip_by_norm - - op : coalesce_tensor args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) output : Tensor[](output){input.size()}, Tensor(fused_output) @@ -684,7 +666,7 @@ backward : heaviside_grad - op : hsigmoid_loss - args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) + args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool is_sparse) output : Tensor(out), Tensor(pre_out), Tensor(w_out) infer_meta : func : HSigmoidLossInferMeta @@ -884,17 +866,6 @@ data_type : param inplace : (param -> param_out), (moment1 -> 
moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) -- op : merged_momentum_ - args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) - output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()} - infer_meta : - func : MergedMomentumInferMeta - optional: master_param - kernel : - func : merged_momentum - data_type : param - inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) - - op : min args : (Tensor x, IntArray axis={}, bool keepdim=false) output : Tensor(out) @@ -1229,15 +1200,6 @@ func : split_with_num backward : split_with_num_grad -- op : squared_l2_norm - args : (Tensor x) - output : Tensor - infer_meta : - func : SquaredL2NormInferMeta - kernel : - func : squared_l2_norm - backward : squared_l2_norm_grad - - op : strided_slice args : (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index f807a3d748ba1..44f065feb7d72 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -356,6 +356,12 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] +- op : clip_by_norm + inputs : + x : X + outputs : + out : Out + - op : complex backward : complex_grad inputs : @@ -1421,6 +1427,12 @@ outputs : out : Out +- op : merged_momentum_ + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut} + - op : meshgrid backward : meshgrid_grad inputs : @@ -1678,6 +1690,12 @@ tensors_name : ShapeTensorList manual_signature : [randint] +- op : randperm + outputs : + out : Out + extra : + attrs : [int seed = 0] + - op : real backward : real_grad inputs : @@ -2229,6 +2247,15 @@ support_tensor : true manual_signature : [uniform] +- op : unique + inputs : + {x : X} + outputs : + {out : Out, indices : Indices, inverse : Index, counts : Counts} + get_expected_kernel_type : + unique : GetUniqueExpectedKernelType + manual_signature : [unique] + - op : unique_consecutive inputs : x : X @@ -2301,3 +2328,10 @@ {x: X, label: Label} outputs : out : Out + +- op: squared_l2_norm + backward: squared_l2_norm_grad + inputs : + x : X + outputs : + out : Out diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 3afbf00c049e6..20adbd31aca06 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -185,6 +185,16 @@ data_type : x optional : ins_tag_weight +- op : average_accumulates_ + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window = 0, int64_t max_average_window = INT64_MAX, int64_t min_average_window = 10000L) + output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) + infer_meta: + func : AverageAccumulatesInferMeta + kernel : + func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, 
dense, dense, dense, dense} + data_type : param + inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) + - op : bce_loss args : (Tensor input, Tensor label) output : Tensor @@ -345,6 +355,15 @@ data_type : x backward : clip_grad +- op : clip_by_norm + args : (Tensor x, float max_norm) + output : Tensor(out) + infer_meta : + func : ClipByNormInferMeta + kernel : + func : clip_by_norm {dense -> dense} + clip_by_norm_sr {selected_rows -> selected_rows} + - op : complex args : (Tensor real, Tensor imag) output : Tensor @@ -1190,6 +1209,17 @@ kernel : func : merge_selected_rows {selected_rows -> selected_rows} +- op : merged_momentum_ + args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) + output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()} + infer_meta : + func : MergedMomentumInferMeta + kernel : + func : merged_momentum + data_type : param + optional: master_param, master_param_out + inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) + - op : meshgrid args : (Tensor[] inputs) output : Tensor[]{inputs.size()} @@ -1744,6 +1774,15 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : squared_l2_norm + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : SquaredL2NormInferMeta + kernel : + func : squared_l2_norm + backward : squared_l2_norm_grad + - op : squeeze args : (Tensor x, IntArray axis={}) output : Tensor(out), Tensor(xshape) diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml index 4e0d4cfc931c5..802c6b1d46df5 100644 --- a/paddle/phi/api/yaml/static_ops.yaml +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -260,6 +260,17 @@ param : [low, high, shape, dtype] data_type : dtype +- op : randperm + args : (int n, DataType dtype = DataType::INT64) + output : Tensor(out) + infer_meta : + func : RandpermInferMeta + param : [n, dtype] + kernel : + func : randperm + param : [n, dtype] + data_type : dtype + - op : reduce args : (Tensor x, int ring_id = 0, int root_id = 0, int reduce_type = 0) output : Tensor(out) @@ -331,3 +342,13 @@ func : uniform param: [shape, dtype, min, max, seed] data_type : dtype + +- op : unique + args : (Tensor x, bool return_index=false, bool return_inverse=false, bool return_counts=false, int[] axis={}, DataType dtype=DataType::INT64, bool is_sorted=false) + output : Tensor(out), Tensor(indices), Tensor(inverse), Tensor(counts) + optional : indices, counts + infer_meta : + func : UniqueRawInferMeta + kernel : + func : unique + data_type : x diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 9cff3acccbd41..ad2e38b70d0ed 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1001,6 +1001,20 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, "It controls whether to use cinn with " "its auto-tune feature enabled"); +/* + * CINN related FLAG + * Name: FLAGS_cinn_subgraph_graphviz_dir + * Since Version: 2.3 + * Value Range: string, default="" + * Example: FLAGS_cinn_subgraph_graphviz_dir="./cinn_graph/" will save the + * CINN sub-graph into "./cinn_graph/", and each sub-graph will 
save into + * "fusion_groups_*"" directory + */ +PADDLE_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir, + "", + "Specify the directory path of dot file of " + "graph, which is used for debug."); + #endif /* diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index ad8409487bb58..5c0aa3b8e89fd 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -35,8 +35,8 @@ inline int ConvOutSize(int input_size, return output_size; } -void Conv2dXPUInferMeta(const MetaTensor& input, - const MetaTensor& input_max, +void Conv2dXPUInferMeta(const MetaTensor& x, + const MetaTensor& x_max, const MetaTensor& filter, const MetaTensor& filter_max, const MetaTensor& bias, @@ -50,9 +50,9 @@ void Conv2dXPUInferMeta(const MetaTensor& input, bool has_branch, int act_type, float act_param, - MetaTensor* output, - MetaTensor* output_max) { - auto in_dims = input.dims(); + MetaTensor* out, + MetaTensor* out_max) { + auto in_dims = x.dims(); auto filter_dims = filter.dims(); // do some checks PADDLE_ENFORCE_EQ( @@ -157,8 +157,8 @@ void Conv2dXPUInferMeta(const MetaTensor& input, strides[i])); } // set output and output max dims - output->set_dims(DDim(out_shape.data(), out_shape.size())); - output_max->set_dims(phi::make_ddim({4})); + out->set_dims(DDim(out_shape.data(), out_shape.size())); + out_max->set_dims(phi::make_ddim({4})); } void EmbeddingWithEltwiseAddXPUInferMeta( diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index 9dcf7342ae193..3105ea8a6d578 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -22,8 +22,8 @@ namespace phi { // Common InferMeta Functions for fusion operators. // NOTE: The InferMeta Functions in this file are arranged in alphabetic order. 
-void Conv2dXPUInferMeta(const MetaTensor& input, - const MetaTensor& input_max, +void Conv2dXPUInferMeta(const MetaTensor& x, + const MetaTensor& x_max, const MetaTensor& filter, const MetaTensor& filter_max, const MetaTensor& bias, @@ -37,8 +37,8 @@ void Conv2dXPUInferMeta(const MetaTensor& input, bool has_branch, int act_type, float act_param, - MetaTensor* output, - MetaTensor* output_max); + MetaTensor* out, + MetaTensor* out_max); void EmbeddingWithEltwiseAddXPUInferMeta( const std::vector& ids, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ea93a5874932e..71fe149e7c0c0 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -40,6 +40,7 @@ void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& learning_rate, const MetaTensor& master_param, float rho, float epsilon, @@ -48,6 +49,11 @@ void AdadeltaInferMeta(const MetaTensor& param, MetaTensor* avg_squared_grad_out, MetaTensor* avg_squared_update_out, MetaTensor* master_param_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_EQ( + phi::product(lr_dims), + 1, + phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( param_dims, @@ -1426,7 +1432,6 @@ void HSigmoidLossInferMeta(const MetaTensor& x, const MetaTensor& path, const MetaTensor& code, int num_classes, - bool remote_prefetch, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index cf6ca3c2a9fb6..307e6115cfd56 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -43,6 +43,7 @@ void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& learning_rate, const MetaTensor& master_param, float rho, float epsilon, @@ -311,7 +312,6 @@ void HSigmoidLossInferMeta(const MetaTensor& x, const MetaTensor& path, const MetaTensor& code, int num_classes, - bool remote_prefetch, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/kernels/adadelta_kernel.h b/paddle/phi/kernels/adadelta_kernel.h index 15c07b3e6f967..16f4e6ca26980 100644 --- a/paddle/phi/kernels/adadelta_kernel.h +++ b/paddle/phi/kernels/adadelta_kernel.h @@ -24,6 +24,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index eddd65184fe93..570ba8dae06cf 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -66,6 +67,22 @@ PD_REGISTER_KERNEL(batch_norm_infer, float, double) {} #ifdef PADDLE_WITH_CUDA +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(batch_norm_infer, + GPU, + ALL_LAYOUT, + phi::BatchNormInferKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + 
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} +#else PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, @@ -79,6 +96,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } } #endif +#endif #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(batch_norm_infer, GPU, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h index 8c8b40c8d9fd0..f4b35c9101836 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h @@ -35,7 +35,6 @@ void HSigmoidLossGradKernelImpl(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc index bc741b32b3afc..9b7a2fd574ea8 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc @@ -31,7 +31,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, @@ -46,7 +45,6 @@ void HSigmoidLossGradKernel(const Context& ctx, pre_out, out_grad, num_classes, - remote_prefetch, is_sparse, x_grad, w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc index c6ee49ef34786..2a611a8d541ca 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc @@ -34,7 +34,6 @@ void HSigmoidLossKernel(const Context& ctx, const paddle::optional& path, const paddle::optional& code, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc index 85d125794871d..c9f69c5f7e4f5 100644 --- a/paddle/phi/kernels/cpu/isfinite_kernel.cc +++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc @@ -25,6 +25,7 @@ PD_REGISTER_KERNEL(isinf, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -37,6 +38,7 @@ PD_REGISTER_KERNEL(isnan, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -49,6 +51,7 @@ PD_REGISTER_KERNEL(isfinite, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); diff --git a/paddle/phi/kernels/funcs/isfinite_functor.h b/paddle/phi/kernels/funcs/isfinite_functor.h index 1dc4fd57b4857..795b8f275c87e 100644 --- a/paddle/phi/kernels/funcs/isfinite_functor.h +++ b/paddle/phi/kernels/funcs/isfinite_functor.h @@ -45,6 +45,13 @@ struct IsNanFunctor { } }; +template <> +struct IsNanFunctor { + HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isnan(a); + } +}; + template struct IsInfFunctor { HOSTDEVICE bool operator()(const T& a) const { @@ -69,6 +76,13 @@ struct IsInfFunctor { } }; +template <> +struct IsInfFunctor { + HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isinf(a); + } +}; + template struct IsFiniteFunctor { HOSTDEVICE bool operator()(const T& a) const { @@ -94,5 +108,12 @@ struct IsFiniteFunctor { } }; +template <> +struct IsFiniteFunctor { + 
HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isfinite(a); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 9da39097e0f8d..0f7d8902de328 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -21,8 +21,8 @@ namespace fusion { template void Conv2dXPUKernel(const Context& ctx, - const DenseTensor& input, - const paddle::optional& input_max, + const DenseTensor& x, + const paddle::optional& x_max, const DenseTensor& filter, const DenseTensor& filter_max, const paddle::optional& bias, @@ -36,10 +36,10 @@ void Conv2dXPUKernel(const Context& ctx, bool has_branch, int act_type, float act_param, - DenseTensor* output, - DenseTensor* output_max) { + DenseTensor* out, + DenseTensor* out_max) { using XPUType = typename XPUTypeTrait::Type; - auto input_dims = input.dims(); + auto input_dims = x.dims(); auto filter_dims = filter.dims(); // update paddings and dilations accoring to padding_algorithm std::vector paddings_vec = paddings; @@ -62,17 +62,16 @@ void Conv2dXPUKernel(const Context& ctx, int win_h = static_cast(filter_dims[2]); int win_w = static_cast(filter_dims[3]); - auto* input_data = reinterpret_cast(input.data()); - const float* input_max_data = input_max.get_ptr() == nullptr - ? nullptr - : input_max.get_ptr()->data(); + auto* input_data = reinterpret_cast(x.data()); + const float* input_max_data = + x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data(); auto* branch_data = branch.get_ptr() == nullptr ? nullptr : reinterpret_cast(branch.get_ptr()->data()); const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); - auto* out_data = reinterpret_cast(ctx.template Alloc(output)); + auto* out_data = reinterpret_cast(ctx.template Alloc(out)); xpu::Activation_t act(static_cast(act_type)); if (act_type == xpu::Activation_t::LEAKY_RELU) { @@ -98,13 +97,13 @@ void Conv2dXPUKernel(const Context& ctx, /* int64_t groups */ groups, /* const float* in_maxptr */ input_max_data, /* const float* filter_maxptr */ filter_max.data(), - /* float* out_maxptr */ ctx.template Alloc(output_max), + /* float* out_maxptr */ ctx.template Alloc(out_max), /* bool is_nchw */ true, /* const float* bias */ bias_data, /* const TY* branch */ branch_data, /* const baidu::xpu::api::Activation_t& act */ act, - /* const float* branch_maxptr */ nullptr); - // /* const float* scale */ nullptr); + /* const float* branch_maxptr */ nullptr, + /* const float* scale */ nullptr); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); } diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ede2458744902..db7f3c3224a03 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -1314,14 +1314,18 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, float, phi::dtype::float16) {} #else +#if CUDNN_VERSION_MIN(8, 1, 0) + PD_REGISTER_KERNEL(batch_norm_grad, GPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double, + phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad 
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad @@ -1334,6 +1338,22 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, phi::BatchNormGradRawKernel, float, double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#else +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad @@ -1342,6 +1362,20 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, } } +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#endif #endif #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 63276e4d53024..fb1bca3daba86 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1221,6 +1221,7 @@ PD_REGISTER_KERNEL(batch_norm, ALL_LAYOUT, phi::BatchNormKernel, float, + phi::dtype::bfloat16, phi::dtype::float16) { kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); @@ -1232,6 +1233,28 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } #else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} +#else PD_REGISTER_KERNEL(batch_norm, GPU, ALL_LAYOUT, @@ -1250,5 +1273,6 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } +#endif #endif diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu index e8c2fa022ec7a..9bde1d7a5bd38 100644 --- a/paddle/phi/kernels/gpu/isfinite_kernel.cu +++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu @@ -25,6 +25,7 @@ PD_REGISTER_KERNEL(isinf, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -37,6 +38,7 @@ PD_REGISTER_KERNEL(isnan, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -49,6 +51,7 @@ PD_REGISTER_KERNEL(isfinite, float, double, phi::dtype::float16, + 
phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu index 598a48f802891..e4cfcb23b730e 100644 --- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(pool2d_grad, phi::Pool2dGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(pool2d_double_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu index 6323909c9d0dc..65d0ef4bdc916 100644 --- a/paddle/phi/kernels/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(pool2d, phi::Pool2dKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(max_pool2d_with_index, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index c073708ed8556..10cf1ea8df534 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -30,6 +30,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/unique_functor.h" +#include "paddle/phi/kernels/index_select_kernel.h" namespace phi { @@ -98,76 +99,6 @@ struct BinaryNotEqual { } }; -// index_select() function for DenseTensor -template -void IndexSelect(const Context& context, - const DenseTensor& input, - const DenseTensor& index, - DenseTensor* output, - int dim) { - auto input_dim = input.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - auto index_size = index.dims()[0]; - - std::vector input_vec; - std::vector index_vec; - phi::TensorToVector(input, context, &input_vec); - phi::TensorToVector(index, context, &index_vec); - std::vector out_vec(output->numel()); - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_vec[i], - 0, - phi::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], - index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], - input_dim[dim], - phi::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. 
Please check input " - "value.", - input_dim[dim], - index_vec[i])); - } - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + j * slice_size + k] = - input_vec[input_start_offset + index_value * slice_size + k]; - } - } - } - context.template Alloc(output); - phi::TensorFromVector(out_vec, context, output); - output->Resize(output_dim); -} - // The core logic of computing Unique for a flattend DenseTensor template [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; DenseTensor in_trans; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - context.template Alloc(&in_trans); - phi::funcs::TransCompute( - in.dims().size(), // num of dims - context, // device - in, // original DenseTensor - &in_trans, // DenseTensor after reshape - permute); // index of axis - + std::vector permute(in.dims().size()); + bool is_transpose = axis != 0; + if (is_transpose) { + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), // num of dims + context, // device + in, // original DenseTensor + &in_trans, // DenseTensor after reshape + permute); // index of axis + } else { + in_trans.ShareDataWith(in); + } // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); @@ -407,22 +343,27 @@ static void UniqueDimsCUDATensor(const Context& context, row); // 3. 
Select indices and reshape back to get 'out' - DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = indices->numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - context.template Alloc(&out_trans); - - IndexSelect(context, in_trans, *indices, &out_trans, 0); - - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - context.template Alloc(out); - std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); - phi::funcs::ConcatFunctor concat_functor; - concat_functor(context, out_trans_unbind, 0, &out_trans); - phi::funcs::TransCompute( - out_trans.dims().size(), context, out_trans, out, permute); + if (is_transpose) { + DenseTensor out_trans; + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + + phi::IndexSelectKernel( + context, in_trans, *indices, 0, &out_trans); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + } else { + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + + phi::IndexSelectKernel(context, in_trans, *indices, 0, out); + } } // functor for processing a flattend DenseTensor diff --git a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h index c36b343017fd5..254264b8c276e 100644 --- a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h @@ -29,7 +29,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/hsigmoid_loss_kernel.h b/paddle/phi/kernels/hsigmoid_loss_kernel.h index 33a90c637e4e4..f1b659a5ba129 100644 --- a/paddle/phi/kernels/hsigmoid_loss_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_kernel.h @@ -27,7 +27,6 @@ void HSigmoidLossKernel(const Context& ctx, const paddle::optional& path, const paddle::optional& code, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h index b0c0a072acd55..18fcd953d6532 100644 --- a/paddle/phi/kernels/impl/adadelta_kernel_impl.h +++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h @@ -13,11 +13,10 @@ // limitations under the License. 
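A rough NumPy illustration of the flow in the unique_kernel.cu refactor above: transpose only when axis != 0, flatten the remaining dims to 2D, deduplicate rows, then gather the kept rows with an index_select. This is a simplified sketch, not Paddle's kernel, and the helper name is illustrative.

    import numpy as np

    def unique_along_axis(x, axis):
        if axis != 0:
            # same swap permutation the kernel builds; applying it twice restores the layout
            perm = list(range(x.ndim))
            perm[0], perm[axis] = perm[axis], perm[0]
            x_t = np.transpose(x, perm)
        else:
            x_t = x  # the new fast path: no transpose, the input buffer is reused as-is
        flat = x_t.reshape(x_t.shape[0], -1)
        _, first_idx = np.unique(flat, axis=0, return_index=True)
        keep = np.sort(first_idx)                      # indices of rows to keep
        out_t = np.take(x_t, keep, axis=0)             # the index_select step
        return np.transpose(out_t, perm) if axis != 0 else out_t

    x = np.array([[1, 2], [1, 2], [3, 4]])
    print(unique_along_axis(x, 0))    # [[1 2] [3 4]]
    print(unique_along_axis(x.T, 1))  # [[1 3] [2 4]]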
#pragma once - #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/adadelta_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -27,6 +26,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, @@ -56,29 +56,30 @@ void AdadeltaKernel(const Context& dev_ctx, auto eigen_avg_squared_update_out = EigenVector::Flatten(*avg_squared_update_out); auto& place = *dev_ctx.eigen_device(); - auto eigen_grad_cast = eigen_grad.template cast(); - eigen_avg_squared_grad_out.device(place) = rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad_cast.square(); - auto update = -((eigen_avg_squared_update + epsilon_) / - (eigen_avg_squared_grad_out + epsilon_)) - .sqrt() * - eigen_grad_cast; - eigen_avg_squared_update_out.device(place) = - rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); - + auto update = + -(((eigen_avg_squared_update + epsilon_).sqrt()) / + ((eigen_avg_squared_grad_out + epsilon_).sqrt()) * eigen_grad_cast); + Eigen::DSizes m_dsize(avg_squared_update_out->numel()); + auto lr = EigenVector::Flatten(learning_rate); if (multi_precision) { auto eigen_master_param_out = EigenVector::Flatten(*master_param_outs); auto eigen_master_param = EigenVector::Flatten(*master_param); - eigen_master_param_out.device(place) = eigen_master_param + update; + eigen_master_param_out.device(place) = + eigen_master_param + lr.broadcast(m_dsize) * update; eigen_param_out.device(place) = - (eigen_param.template cast() + update).template cast(); + (eigen_param.template cast() + lr.broadcast(m_dsize) * update) + .template cast(); } else { - eigen_param_out.device(place) = eigen_param + update.template cast(); + eigen_param_out.device(place) = + eigen_param + (lr.broadcast(m_dsize) * update).template cast(); } + eigen_avg_squared_update_out.device(place) = + rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); } } // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc index 4bb0352528e4e..9d450f1d5dbed 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc @@ -48,7 +48,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, @@ -74,7 +73,6 @@ void HSigmoidLossGradKernel(const Context& ctx, pre_out, out_grad, num_classes, - remote_prefetch, is_sparse, x_grad, w_grad_value, diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h index 94ac63183fbfb..50719408acf11 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h @@ -31,7 +31,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, diff --git a/paddle/phi/kernels/xpu/adadelta_kernel.cc b/paddle/phi/kernels/xpu/adadelta_kernel.cc index 
e02a5aeabad2e..b87ec1afbdc36 100644 --- a/paddle/phi/kernels/xpu/adadelta_kernel.cc +++ b/paddle/phi/kernels/xpu/adadelta_kernel.cc @@ -25,6 +25,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, diff --git a/paddle/phi/ops/compat/adadelta_sig.cc b/paddle/phi/ops/compat/adadelta_sig.cc index fd285e7e5d0e5..da7e4229a0d22 100644 --- a/paddle/phi/ops/compat/adadelta_sig.cc +++ b/paddle/phi/ops/compat/adadelta_sig.cc @@ -18,14 +18,18 @@ namespace phi { KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature( - "adadelta", - {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}, - {"rho", "epsilon", "multi_precision"}, - {"ParamOut", - "AvgSquaredGradOut", - "AvgSquaredUpdateOut", - "MasterParamOut"}); + return KernelSignature("adadelta", + {"Param", + "Grad", + "AvgSquaredGrad", + "AvgSquaredUpdate", + "LearningRate", + "MasterParam"}, + {"rho", "epsilon", "multi_precision"}, + {"ParamOut", + "AvgSquaredGradOut", + "AvgSquaredUpdateOut", + "MasterParamOut"}); } return KernelSignature("unregistered", {}, {}, {}); diff --git a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc deleted file mode 100644 index c14e8ab357553..0000000000000 --- a/paddle/phi/ops/compat/average_accumulates_sig.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { -KernelSignature AverageAccumulatesOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "average_accumulates", - {"param", - "in_sum_1", - "in_sum_2", - "in_sum_3", - "in_num_accumulates", - "in_old_num_accumulates", - "in_num_updates"}, - {"average_window", "max_average_window", "min_average_window"}, - {"out_sum_1", - "out_sum_2", - "out_sum_3", - "out_num_accumulates", - "out_old_num_accumulates", - "out_num_updates"}); -} -} // namespace phi -PD_REGISTER_ARG_MAPPING_FN(average_accumulates, - phi::AverageAccumulatesOpArgumentMapping); diff --git a/paddle/phi/ops/compat/clip_by_norm_sig.cc b/paddle/phi/ops/compat/clip_by_norm_sig.cc deleted file mode 100644 index 8a2cecc0293d3..0000000000000 --- a/paddle/phi/ops/compat/clip_by_norm_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
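The Adadelta changes above (kernel impl, XPU kernel, infer-meta and adadelta_sig.cc) thread a LearningRate input through the op, scale the step by it, and accumulate the squared update after the step is formed. A small NumPy sketch of the resulting update rule as read from the Eigen code above; it is a simplification, not the actual kernel:

    import numpy as np

    def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, lr, rho=0.95, eps=1e-6):
        # accumulate squared gradients
        avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
        # update direction: RMS of past updates over RMS of gradients, times the gradient
        update = -(np.sqrt(avg_sq_update + eps) / np.sqrt(avg_sq_grad + eps)) * grad
        # new in this change: the step is scaled by the LearningRate input
        param = param + lr * update
        # squared-update accumulator is refreshed after the step, matching the kernel order
        avg_sq_update = rho * avg_sq_update + (1 - rho) * update ** 2
        return param, avg_sq_grad, avg_sq_update

    p, g = np.array([1.0, 2.0]), np.array([0.1, -0.2])
    print(adadelta_step(p, g, np.zeros(2), np.zeros(2), lr=1.0))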
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("X")) { - return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); - } else if (ctx.IsSelectedRowsInput("X")) { - return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); - } - return KernelSignature("unregistered", {}, {}, {}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc index a8db0b33242bd..9499e0b9fc0dd 100644 --- a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -20,7 +20,7 @@ KernelSignature HierarchicalSigmoidOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("hsigmoid_loss", {"X", "Label", "W", "Bias", "PathTable", "PathCode"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"Out", "PreOut", "W_Out"}); } @@ -36,7 +36,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else if (ctx.IsSelectedRowsOutput("W@GRAD")) { return KernelSignature("hsigmoid_loss_grad_sr", @@ -48,7 +48,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else { return KernelSignature("unregistered", {}, {}, {}); diff --git a/paddle/phi/ops/compat/merged_momentum_sig.cc b/paddle/phi/ops/compat/merged_momentum_sig.cc deleted file mode 100644 index 3444d5e2d3097..0000000000000 --- a/paddle/phi/ops/compat/merged_momentum_sig.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature MergedMomentumOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "merged_momentum", - {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, - {"mu", - "use_nesterov", - "regularization_method", - "regularization_coeff", - "multi_precision", - "rescale_grad"}, - { - "ParamOut", - "VelocityOut", - "MasterParamOut", - }); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(merged_momentum, - phi::MergedMomentumOpArgumentMapping); diff --git a/paddle/phi/ops/compat/squared_l2_norm_sig.cc b/paddle/phi/ops/compat/squared_l2_norm_sig.cc deleted file mode 100644 index 7b228008f2839..0000000000000 --- a/paddle/phi/ops/compat/squared_l2_norm_sig.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SquaredL2NormOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("squared_l2_norm", {"X"}, {}, {"Out"}); -} - -KernelSignature SquaredL2NormGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "squared_l2_norm_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm, - phi::SquaredL2NormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm_grad, - phi::SquaredL2NormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/phi/ops/compat/unique_sig.cc index 2a7ba543012f3..8a38775bc6080 100644 --- a/paddle/phi/ops/compat/unique_sig.cc +++ b/paddle/phi/ops/compat/unique_sig.cc @@ -17,6 +17,17 @@ limitations under the License. 
*/ namespace phi { KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsForInferShape()) { + return KernelSignature("unique_raw", + {"X"}, + {"return_index", + "return_inverse", + "return_counts", + "axis", + "dtype", + "is_sorted"}, + {"Out", "Indices", "Index", "Counts"}); + } bool is_sorted = paddle::any_cast(ctx.Attr("is_sorted")); if (is_sorted) { return KernelSignature( diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2db59b7b61ce7..4693d78e2dc32 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2237,7 +2237,7 @@ set +x set -x ut_endTime_s=`date +%s` echo "XPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - python ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py + python ${PADDLE_ROOT}/build/test/xpu/get_test_cover_info.py unset XPU_OP_LIST_DIR if [[ "$EXIT_CODE" != "0" ]]; then exit 8; diff --git a/paddle/utils/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h index f9c55fe1835fd..41319c391455e 100644 --- a/paddle/utils/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -691,6 +691,8 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'X': out.setf(std::ios::uppercase); + break; + case 'x': case 'p': out.setf(std::ios::hex, std::ios::basefield); @@ -698,17 +700,23 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'E': out.setf(std::ios::uppercase); + break; case 'e': out.setf(std::ios::scientific, std::ios::floatfield); out.setf(std::ios::dec, std::ios::basefield); break; case 'F': out.setf(std::ios::uppercase); + + break; + case 'f': out.setf(std::ios::fixed, std::ios::floatfield); break; case 'G': out.setf(std::ios::uppercase); + break; + case 'g': out.setf(std::ios::dec, std::ios::basefield); // As in boost::format, let stream decide float format. diff --git a/python/env_dict.py.in b/python/env_dict.py.in index 5b2078c67510c..00ca04dc56cde 100644 --- a/python/env_dict.py.in +++ b/python/env_dict.py.in @@ -73,5 +73,7 @@ env_dict={ 'JIT_RELEASE_WHL':'@JIT_RELEASE_WHL@', 'WITH_PSLIB':'@WITH_PSLIB@', 'PYBIND_INCLUDE_DIR':'@PYBIND_INCLUDE_DIR@', - 'WITH_PYTHON':'@WITH_PYTHON@' + 'WITH_PYTHON':'@WITH_PYTHON@', + 'WITH_CINN':'@WITH_CINN@', + 'CINN_SOURCE_DIR':'@CINN_SOURCE_DIR@' } diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 7437b0d9df6e3..60df9de03ad11 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -28,4 +28,68 @@ from . import debugging # noqa: F401 -__all__ = ['auto_cast', 'GradScaler', 'decorate'] +from paddle.fluid import core +from paddle.fluid.framework import ( + _current_expected_place, + _get_paddle_place, +) + +__all__ = [ + 'auto_cast', + 'GradScaler', + 'decorate', + 'is_float16_supported', + 'is_bfloat16_supported', +] + + +def is_float16_supported(device=None): + """ + Determine whether the place supports float16 in the auto-mixed-precision training. + + Args: + device (str|None, optional): Specify the running device. + It can be ``cpu``, ``gpu``, ``xpu``, ``gpu:x`` and ``xpu:x``, + where ``x`` is the index of the GPUs or XPUs. if device is None, the device is the current device. Default: None. + + Examples: + + .. 
code-block:: python + + import paddle + paddle.amp.is_float16_supported() # True or False + """ + + device = ( + _current_expected_place() + if device is None + else _get_paddle_place(device) + ) + + return core.is_float16_supported(device) + + +def is_bfloat16_supported(device=None): + """ + Determine whether the place supports bfloat16 in the auto-mixed-precision training. + + Args: + device (str|None, optional): Specify the running device. + It can be ``cpu``, ``gpu``, ``xpu``, ``gpu:x`` and ``xpu:x``, + where ``x`` is the index of the GPUs or XPUs. if device is None, the device is the current device. Default: None. + + Examples: + + .. code-block:: python + + import paddle + paddle.amp.is_bfloat16_supported() # True or False + """ + + device = ( + _current_expected_place() + if device is None + else _get_paddle_place(device) + ) + + return core.is_bfloat16_supported(device) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 33c7855d89724..bc76f866d94eb 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -213,6 +213,9 @@ def pure_fp16_initialize(models): paddle.nn.BatchNorm3D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm, + paddle.nn.InstanceNorm1D, + paddle.nn.InstanceNorm2D, + paddle.nn.InstanceNorm3D, ), ): continue @@ -522,7 +525,7 @@ def amp_decorate( ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. - When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm. + When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm, InstanceNorm and LayerNorm. Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode. @@ -530,7 +533,7 @@ def amp_decorate( models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; - O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm and LayerNorm. Default is O1(amp) + O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. @@ -741,7 +744,7 @@ def decorate( ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. - When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. + When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. 
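For context on the amp changes above, is_float16_supported / is_bfloat16_supported become public helpers and O2 decoration now also leaves InstanceNorm parameters in float32. A hedged usage sketch; it assumes a CUDA build, and the dtype comment describes the expected behavior after this change rather than captured output:

    import paddle

    if paddle.amp.is_float16_supported():          # new public capability check
        model = paddle.nn.Sequential(
            paddle.nn.Linear(8, 8),
            paddle.nn.InstanceNorm1D(8),
        )
        opt = paddle.optimizer.Adam(parameters=model.parameters())
        model, opt = paddle.amp.decorate(models=model, optimizers=opt, level='O2')
        for name, param in model.named_parameters():
            # Linear weight/bias are cast to float16; InstanceNorm scale/bias stay float32
            print(name, param.dtype)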
Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode. @@ -749,7 +752,7 @@ def decorate( models(Layer|list of Layer): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': O1 represent mixed precision, the decorator will do nothing; - O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1(amp) + O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 83f5704f29cb0..d2fbadd78b9c5 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -102,6 +102,16 @@ def set_field_default_config(category, field, default_value): set_field_default_config(GRADIENT_MERGE, "k_steps", 1) set_field_default_config(GRADIENT_MERGE, "avg", True) +######################################### +# pipeline configuration +######################################### +PIPELINE = "pipeline" +set_field_default_config(PIPELINE, "enable", False) +set_field_default_config(PIPELINE, "schedule_mode", "1F1B") +set_field_default_config(PIPELINE, "micro_batch_size", 1) +set_field_default_config(PIPELINE, "accumulate_steps", 1) +set_field_default_config(PIPELINE, "generation_batch_size", 1) + ######################################### # quantization configuration ######################################### diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py index 6c081f94a2aad..f9c0b3cb15db2 100644 --- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -606,8 +606,8 @@ def get_cost_from_engine(engine, mode): ) serial_startup_prog = ( - engine._serial_startup_progs[mode].clone() - if mode in engine._serial_startup_progs + engine._fwd_dist_contexts[mode]._original_serial_main_program.clone() + if mode in engine._fwd_dist_contexts else engine._orig_startup_prog.clone() ) losses = ( diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 22a83ae341d62..f3418f271825a 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -130,6 +130,9 @@ def __init__( # A flag indicates whether the used parallelism is data parallel self._data_parallel = False + # record upstream and downstream of cur rank + self._up_down_streams = UpDownStream() + self._json_config = 
json_config @property @@ -218,6 +221,10 @@ def gradient_scale(self, gs): def data_parallel(self): return self._data_parallel + @property + def up_down_streams(self): + return self._up_down_streams + @data_parallel.setter def data_parallel(self, dp): self._data_parallel = dp @@ -1220,3 +1227,45 @@ def parse_backward_blocks(self, program): self.nblock += 1 assert self.nblock == len(program.blocks) + + +class UpDownStream: + def __init__(self): + self._ups = {} + self._downs = {} + + def add_up_stream(self, rank, up_stream): + ups = self._ups.get(rank, None) + if not ups: + self._ups[rank] = [up_stream] + elif up_stream != -1: + ups = list(filter(lambda a: a != -1, ups)) + ups.append(up_stream) + self._ups[rank] = ups + + def add_down_stream(self, rank, down_stream): + downs = self._downs.get(rank, None) + if not downs: + self._downs[rank] = [down_stream] + elif down_stream != -1: + downs = list(filter(lambda a: a != -1, downs)) + downs.append(down_stream) + self._downs[rank] = downs + + def add_pair_stream(self, up, down): + self.add_up_stream(up, -1) + self.add_up_stream(down, up) + self.add_down_stream(up, down) + self.add_down_stream(down, -1) + + def ups(self, rank): + ups = self._ups.get(rank, None) + if not ups: + return None + return list(set(ups)) + + def downs(self, rank): + downs = self._downs.get(rank, None) + if not downs: + return None + return list(set(downs)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 7960adafbdfc4..8489d3f3332a6 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -29,8 +29,6 @@ class DistributedOperator: def __init__(self, serial_op, dist_attr=None): self._serial_op = serial_op if dist_attr is not None and isinstance(dist_attr, OperatorDistAttr): - pass - # TODO: remove this deepcopy after we fix the issue self._dist_attr = copy.deepcopy(dist_attr) # self._dist_attr = dist_attr @@ -56,21 +54,6 @@ def dist_attr(self, dist_attr): self._dist_attr = dist_attr # TODO: Do we really need to write back to serial opļ¼Ÿ self._serial_op.dist_attr = dist_attr - # if self._dist_attr is None: - # self._dist_attr = OperatorDistAttr() - # # Create new dist_attr related to current serial_op - # dist_attr = self._filter_dist_attr(dist_attr) - # # Append suffix to mark the inputs or outputs - # if isinstance(dist_attr, dict): - # # Copy the keys since we may add new ones - # for key in list(dist_attr.keys()): - # if isinstance(key, Variable): - # if key.name in self._serial_op.input_arg_names: - # dist_attr[append_op_input_suffix(key.name)] = True - # if key.name in self._serial_op.output_arg_names: - # dist_attr[append_op_output_suffix(key.name)] = True - # self._dist_attr.init(dist_attr) - # self._init_default_dist_attr() def get_serial_input(self, name): if self._serial_op.type == "create_py_reader": @@ -83,81 +66,6 @@ def get_serial_output(self, name): tensor = self._serial_op.block._var_recursive(name) return tensor - # def _init_default_dist_attr(self): - # for tensor_name in self._serial_op.input_arg_names: - # if self._serial_op.type == "create_py_reader": - # tensor = None - # else: - # tensor = self._serial_op.block._var_recursive(tensor_name) - # self._serial_inputs[tensor_name] = tensor - # if tensor is None: - # tensor_shape = [] - # else: - # if tensor.type in __no_shape_var_type__: - # tensor_shape = [] - # else: - # tensor_shape = tensor.shape - # if self._dist_attr.get_input_dims_mapping(tensor_name) is None: - 
# tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - # self._dist_attr.set_input_dims_mapping( - # tensor_name, tensor_dims_mapping - # ) - # for tensor_name in self._serial_op.output_arg_names: - # tensor = self._serial_op.block._var_recursive(tensor_name) - # if tensor.type in __no_shape_var_type__: - # tensor_shape = [] - # else: - # tensor_shape = tensor.shape - # self._serial_outputs[tensor_name] = tensor - # if self._dist_attr.get_output_dims_mapping(tensor_name) is None: - # tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - # self._dist_attr.set_output_dims_mapping( - # tensor_name, tensor_dims_mapping - # ) - # if self._dist_attr.op_type is None: - # self._dist_attr.op_type = self.serial_op.type - # if self._dist_attr.impl_type is None: - # self._dist_attr.impl_type = "default" - # if self._dist_attr.impl_idx is None: - # self._dist_attr.impl_idx = 0 - # if self._dist_attr.is_recompute is None: - # self._dist_attr.is_recompute = False - - # def _filter_dist_attr(self, dist_attr): - # if dist_attr is None: - # return None - # new_dist_attr = None - # if isinstance(dist_attr, dict): - # new_dist_attr = {} - # for key, value in dist_attr.items(): - # if isinstance(key, Variable): - # if ( - # key.name in self._serial_op.input_arg_names - # or key.name in self._serial_op.output_arg_names - # ): - # new_dist_attr[key] = value - # else: - # new_dist_attr[key] = value - # elif isinstance(dist_attr, OperatorDistAttr): - # new_dist_attr = copy.deepcopy(dist_attr) - # new_dist_attr._inputs_dist_attrs.clear() - # new_dist_attr._outputs_dist_attrs.clear() - # for tensor_name in self._serial_op.input_arg_names: - # tensor_dist_attr = dist_attr.get_input_dist_attr(tensor_name) - # if tensor_dist_attr: - # new_dist_attr.set_input_dist_attr( - # tensor_name, tensor_dist_attr - # ) - # for tensor_name in self._serial_op.output_arg_names: - # tensor_dist_attr = dist_attr.get_output_dist_attr(tensor_name) - # if tensor_dist_attr: - # new_dist_attr.set_output_dist_attr( - # tensor_name, tensor_dist_attr - # ) - # else: - # assert False, "Cannot recognize the {} parameter.".format(dist_attr) - # return new_dist_attr - def validate_dist_attr(self): if "read" in self.serial_op.type or "while" == self.serial_op.type: return True @@ -402,5 +310,6 @@ def __call__(self, *args, **kwargs): if self._process_mesh is not None: dist_op.dist_attr.mark_annotated("process_mesh") default_dist_ctx.add_dist_op_for_program(dist_op) + default_dist_ctx.add_process_mesh(self._process_mesh) return output diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/dist_saver.py index 87a0319204fd3..8772d234ddf99 100644 --- a/python/paddle/distributed/auto_parallel/dist_saver.py +++ b/python/paddle/distributed/auto_parallel/dist_saver.py @@ -192,17 +192,27 @@ def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs): used_inputs += op.input_arg_names used_outputs += op.output_arg_names - for idx, var_name in enumerate(feed_vars_names): - if var_name not in used_inputs: - feed_vars_names.pop(idx) - for idx, var_name in enumerate(fetch_vars_names): - if var_name not in used_outputs: - fetch_vars_names.pop(idx) + # delete duplicated elements and keep order + feed_vars_names = list({}.fromkeys(feed_vars_names).keys()) + used_inputs = list({}.fromkeys(used_inputs).keys()) + fetch_vars_names = list({}.fromkeys(fetch_vars_names).keys()) + used_outputs = list({}.fromkeys(used_outputs).keys()) + + dist_feed_vars_names = [ + var_name for var_name 
in feed_vars_names if var_name in used_inputs + ] + dist_fetch_vars_names = [ + var_name + for var_name in fetch_vars_names + if var_name in used_outputs + ] dist_feed_vars = list( - reversed([global_block.vars[name] for name in feed_vars_names]) + reversed([global_block.vars[name] for name in dist_feed_vars_names]) ) - dist_fetch_vars = [global_block.vars[name] for name in fetch_vars_names] + dist_fetch_vars = [ + global_block.vars[name] for name in dist_fetch_vars_names + ] dist_filename = filename + "_dist" + str(rank_id) dist_path = os.path.join(dirname, dist_filename) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a84bea42d538f..9a4f8611daf42 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,7 +17,6 @@ import numbers import os import random -from collections import defaultdict import numpy as np @@ -154,7 +153,6 @@ def __init__( " or `paddle.static.Optimizer`." ) self._optimizer = auto_utils.validate_opt(optimizer) - self._orig_optimizer = copy.deepcopy(self._optimizer) metrics = metrics or [] for metric in auto_utils.to_list(metrics): @@ -185,6 +183,12 @@ def __init__( ) fleet.init(is_collective=True) + # for compute cost + # TODO: remove _fwd_main_progs and _orig_optimizer + self._fwd_dist_contexts = {} + self._fwd_main_progs = {} + self._orig_optimizer = copy.deepcopy(self._optimizer) + self._executor = None self._cur_rank = paddle.distributed.get_rank() self._nranks = paddle.distributed.get_world_size() @@ -194,14 +198,6 @@ def __init__( self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() self._dist_contexts = {} - self._fwd_main_progs = {} - self._fwd_dist_contexts = {} - self._serial_main_progs = {} - self._serial_startup_progs = {} - self._dist_main_progs = defaultdict(dict) # dist main programs - self._dist_startup_progs = defaultdict(dict) # dist startup programs - self._feed_vars = {} - self._fetch_vars = {} self._planners = {} self._has_prepared = {"train": False, "eval": False, "predict": False} self._has_prepared_reader = { @@ -334,9 +330,9 @@ def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels): return inputs, labels - def _prepare_reader(self): - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + def _prepare_reader(self, feed_list=[]): dist_context = self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: this list may be changed if Paddle changes the existing rules. 
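The dist_saver.py change above fixes a subtle bug: popping from feed_vars_names while enumerating it skips elements. The new code deduplicates while preserving order and then filters against the names actually used by the program. A standalone sketch of that pattern:

    def dedup_keep_order(names):
        # dict keys are insertion-ordered in Python 3.7+, so this removes duplicates
        # without reordering, matching list({}.fromkeys(names).keys()) in the diff
        return list({}.fromkeys(names).keys())

    feed_vars_names = ['x', 'x', 'label', 'unused_input']
    used_inputs = ['x', 'label']
    dist_feed_vars_names = [
        name for name in dedup_keep_order(feed_vars_names) if name in used_inputs
    ]
    print(dist_feed_vars_names)   # ['x', 'label']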
@@ -357,10 +353,13 @@ def _prepare_reader(self): if op.type in related_reader_ops: reader_op_indices.append(idx) # Step 2: insert the new reader ops to cpp + # record the read ops' desc to insert to program of forward task_node + read_ops_desc = [] new_reader_ops = [] for idx in reversed(reader_op_indices): new_op_desc = dist_main_block.desc._prepend_op() new_op_desc.copy_from(dist_main_block.ops[idx].desc) + read_ops_desc.append(new_op_desc) new_op = Operator( dist_main_block, new_op_desc, type=new_op_desc.type() ) @@ -379,6 +378,29 @@ def _prepare_reader(self): dist_main_block._sync_with_cpp() self._has_prepared_reader[self._mode] = True + # Insert read op to forward TaskNode if 1F1B pass is setted + if self.main_program._pipeline_opt: + assert "tasks" in self.main_program._pipeline_opt["fleet_opt"] + fleet_opt = self.main_program._pipeline_opt["fleet_opt"] + fwd_task = fleet_opt["tasks"][0] + fwd_prog = fwd_task.get_program() + fwd_block = fwd_prog.global_block() + + for var in feed_list: + if var.name not in fwd_block.vars: + fwd_block._clone_variable(var) + + for op_desc in read_ops_desc: + new_op_desc = fwd_block.desc._prepend_op() + new_op_desc.copy_from(op_desc) + new_op = Operator( + fwd_block, new_op_desc, type=new_op_desc.type() + ) + fwd_block.ops.insert(0, new_op) + + fwd_block._sync_with_cpp() + fwd_task.set_program(fwd_prog) + def _prepare_feed(self, data, user_feeds, mode): feeds = {} if data is not None: @@ -428,14 +450,16 @@ def _process_fetch_group(group_name, var_list): fetch_names.append([]) fetch_indices.append(group_indices) + dist_context = self._dist_contexts[mode] + fetch_vars = dist_context.serial_fetch_vars if mode != "predict": - _process_fetch_group("loss", self._fetch_vars[mode]["loss"]) + _process_fetch_group("loss", fetch_vars["loss"]) if mode != "predict": - metrics = self._fetch_vars[mode]["metrics"] + metrics = fetch_vars["metrics"] for i, var_list in enumerate(metrics): _process_fetch_group("metrics_" + str(i), var_list) if mode == "predict": - _process_fetch_group("outputs", self._fetch_vars[mode]["outputs"]) + _process_fetch_group("outputs", fetch_vars["outputs"]) for usr_fetch in user_fetches: var_name = _to_name_str(usr_fetch) fetch(var_name) @@ -472,7 +496,8 @@ def _prepare_logger( logs["loss"] = outs[idx][0] group_idx += 1 # logging metrics - metric_vars = self._fetch_vars[mode]["metrics"] + dist_context = self._dist_contexts[mode] + metric_vars = dist_context.serial_fetch_vars["metrics"] if metric_vars: for metric in self._metrics: metrics_indices = fetch_indices[group_idx] @@ -503,15 +528,18 @@ def _prepare_logger( logs["fetches"] = logs_fetch return logs - def _prepare_program(self, mode): + def _prepare_program(self, mode, init_parameters=True): # Do the build process self._build(mode) # Do the planning process self._plan(mode) # Do the parallel process self._parallel(mode) - # Init comm and startup program - self._initialize(mode) + # Init comm + self._init_comm() + if init_parameters: + # startup program + self._initialize(mode) self._has_prepared[mode] = True def _build(self, mode): @@ -543,9 +571,9 @@ def _build(self, mode): paddle.enable_static() else: - # build program in static graph mode - serial_main_prog = self._serial_main_progs.get(mode, None) - if serial_main_prog is not None: + # build program in static mode + dist_context = self._dist_contexts.get(mode, None) + if dist_context is not None: return outputs = [] @@ -735,42 +763,23 @@ def _init_dist_context(self, mode): ) dist_context.set_op_dist_attr_for_program(op, 
ref_op_dist_attr) - def _initialize(self, mode): - # Get the current content from the distributed context - self._serial_main_progs[mode] = self._dist_contexts[ - mode - ].serial_main_program - self._serial_startup_progs[mode] = self._dist_contexts[ - mode - ].serial_startup_program - self._dist_main_progs[mode] = self._dist_contexts[ - mode - ].dist_main_programs - self._dist_startup_progs[mode] = self._dist_contexts[ - mode - ].dist_startup_programs - self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars - self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars - self._optimizer = self._dist_contexts[mode]._serial_optimizer - + def _init_comm(self): if self._nranks > 1: # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() - cur_rank = self._cur_rank - # NOTE: After the implementation of the unified dynamic and static communication group - # initialization mode in the future, the initialization logic of full mode - # will be removed because port occupation error may occur. + if self._strategy.auto_mode == "full": auto_utils.initialize_pg_in_full_mode( - all_process_groups, cur_rank + all_process_groups, self._cur_rank ) else: for process_group in all_process_groups: - if cur_rank not in process_group.ranks: + if self._cur_rank not in process_group.ranks: continue process_group.instantiate() + def _initialize(self, mode): self._place = _get_device() if isinstance(self._place, paddle.framework.CUDAPlace): self._place = paddle.framework.CUDAPlace( @@ -782,9 +791,9 @@ def _initialize(self, mode): np.random.seed(self._strategy.seed + self._dp_ranks[0]) random.seed(self._strategy.seed + self._dp_ranks[0]) + dist_context = self._dist_contexts[mode] if self._dygraph_mode: - dist_context = self._dist_contexts[mode] - dist_main_program = self._dist_main_progs[mode][self._cur_rank] + dist_main_program = dist_context.dist_main_programs[self._cur_rank] self.program_helper.init( dist_main_program, self._place, dist_context ) @@ -792,7 +801,9 @@ def _initialize(self, mode): if self._executor is None: self._executor = paddle.static.Executor(self._place) uninitialized = [] - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[ + self._cur_rank + ] for var in dist_startup_prog.list_vars(): scope_var = global_scope().find_var(var.name) if scope_var and scope_var.get_tensor()._is_initialized(): @@ -809,7 +820,9 @@ def _initialize(self, mode): if self._strategy.reinit: self._logger.info("NOTE: parameters will be re-initialized.") - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[ + self._cur_rank + ] self._executor.run(dist_startup_prog) def fit( @@ -1282,6 +1295,7 @@ def prepare( main_program=None, startup_program=None, mode=None, + init_parameters=True, ): if mode is not None: self.to_mode(mode) @@ -1324,7 +1338,7 @@ def prepare( self._inputs_spec, self._labels_spec = inputs_spec, labels_spec self._inputs, self._labels = inputs, labels if not self._has_prepared[self._mode]: - self._prepare_program(self._mode) + self._prepare_program(self._mode, init_parameters) else: self._switch_mode(self._mode) @@ -1375,16 +1389,17 @@ def _prepare_dataloader( ) batch_size //= self._k_steps - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] - dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = 
self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. # Cause predict_program does not contain labels var, # then we will add labels var from serial_program to dist_program, # that maintains the length of feed_list equal to the length of dataset's values. - inputs_var = self._feed_vars[self._mode]["inputs"] - labels_var = self._feed_vars[self._mode]["labels"] + inputs_var = dist_context.serial_feed_vars["inputs"] + labels_var = dist_context.serial_feed_vars["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: @@ -1443,16 +1458,17 @@ def _prepare_dataloader_from_generator( ) batch_size //= self._k_steps - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] - dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. # Cause predict_program does not contain labels var, # then we will add labels var from serial_program to dist_program, # that maintains the length of feed_list equal to the length of dataset's values. - inputs_var = self._feed_vars[self._mode]["inputs"] - labels_var = self._feed_vars[self._mode]["labels"] + inputs_var = dist_context.serial_feed_vars["inputs"] + labels_var = dist_context.serial_feed_vars["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: @@ -1482,7 +1498,7 @@ def _prepare_dataloader_from_generator( data_parallel_world_size=self._dp_world_sizes, data_parallel_rank=self._dp_ranks, ) - self._prepare_reader() + self._prepare_reader(feed_list) return dataloader def _tune(self, tune_data, tune_sample_split=None, batch_size=1): @@ -1542,7 +1558,7 @@ def _metrics_name(self): def _switch_mode(self, mode): assert ( - mode in self._dist_main_progs + mode in self._dist_contexts ), f"{mode} model is not ready, please call `prepare()` first." 
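These Engine hunks replace the per-mode caches (_serial_main_progs, _dist_main_progs, _feed_vars, ...) with direct reads from the DistributedContext stored for that mode. A minimal, self-contained sketch of the delegation pattern (toy classes for illustration only, not Paddle's real ones):

class ToyDistContext:
    def __init__(self):
        self.dist_main_programs = {}     # rank -> distributed main program
        self.dist_startup_programs = {}  # rank -> distributed startup program
        self.serial_feed_vars = {"inputs": [], "labels": []}

class ToyEngine:
    def __init__(self, cur_rank=0):
        self._mode = "train"
        self._cur_rank = cur_rank
        self._dist_contexts = {"train": ToyDistContext()}

    @property
    def main_program(self):
        # Mirrors the refactored Engine.main_program: no separate cache,
        # just read from the current mode's distributed context.
        return self._dist_contexts[self._mode].dist_main_programs[self._cur_rank]

engine = ToyEngine()
engine._dist_contexts["train"].dist_main_programs[0] = "dist_main_program_of_rank_0"
print(engine.main_program)  # dist_main_program_of_rank_0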
self.to_mode(mode) self._optimizer = self._dist_contexts[mode]._serial_optimizer @@ -1556,8 +1572,8 @@ def to_mode(self, mode): self._mode = mode def _set_state_dict(self, mode, strict, state_dict, dist_attr): - program = self._dist_main_progs[mode][self._cur_rank] dist_context = self._dist_contexts[mode] + program = dist_context.dist_main_programs[self._cur_rank] cur_dist_attr = auto_utils.get_dist_attr(program, dist_context) converter = Converter(state_dict, dist_attr, cur_dist_attr) state_dict = converter.convert(strict=strict) @@ -1622,10 +1638,10 @@ def save(self, path, training=True): """ if training: - assert self._mode in self._serial_main_progs - serial_program = self._serial_main_progs[self._mode] - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + assert self._mode in self._dist_contexts dist_context = self._dist_contexts[self._mode] + serial_program = dist_context.serial_main_program + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] self._saver.save( path, serial_program=serial_program, @@ -1633,10 +1649,11 @@ def save(self, path, training=True): dist_context=dist_context, ) else: - assert "predict" in self._dist_main_progs - feed_vars = self._feed_vars["predict"]['inputs'] - fetch_vars = self._fetch_vars["predict"]['outputs'] - dist_main_prog = self._dist_main_progs["predict"][self._cur_rank] + assert "predict" in self._dist_contexts + dist_context = self._dist_contexts["predict"] + feed_vars = dist_context.serial_feed_vars['inputs'] + fetch_vars = dist_context.serial_fetch_vars['outputs'] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] if self._strategy.qat.enable and self._strategy.qat.onnx_format: from paddle.static.quantization import QuantWeightPass @@ -1776,11 +1793,13 @@ def cost(self, inputs_spec=None, labels_spec=None, mode=None): @property def main_program(self): - return self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + return dist_context.dist_main_programs[self._cur_rank] @property def startup_program(self): - return self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + return dist_context.dist_startup_programs[self._cur_rank] @property def dist_context(self): @@ -1788,15 +1807,30 @@ def dist_context(self): @property def serial_main_program(self): - return self._serial_main_progs[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_main_program @property def serial_startup_program(self): - return self._serial_startup_progs[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_startup_program + + @property + def feed_vars(self): + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_feed_vars @property def fetch_vars(self): - return self._fetch_vars[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_fetch_vars + + @property + def optimizer(self): + dist_context = self._dist_contexts[self._mode] + if dist_context._serial_optimizer: + return dist_context._serial_optimizer + return self._optimizer @property def inputs(self): diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index 9fda85ecef010..76207bc588968 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -79,7 +79,15 @@ def shard_tensor(x, process_mesh=None, shard_spec=None): assert 
isinstance( shard_spec, list ), f"Argument shard_spec {shard_spec} is not an instance of list" - dist_tensor = DistributedTensor(x) + if isinstance(x, str): + x = ( + paddle.static.default_main_program() + .global_block() + ._var_recursive(x) + ) + dist_tensor = DistributedTensor(x) + else: + dist_tensor = DistributedTensor(x) serial_tensor = dist_tensor.serial_tensor dist_tensor.dist_attr.process_mesh = process_mesh if serial_tensor.type in __no_shape_var_type__: @@ -102,6 +110,7 @@ def shard_tensor(x, process_mesh=None, shard_spec=None): default_dist_ctx = get_default_distributed_context() default_dist_ctx.add_dist_tensor_for_program(dist_tensor) dist_tensor = default_dist_ctx.get_dist_tensor_for_program(x) + default_dist_ctx.add_process_mesh(process_mesh) return x diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index d2463f3308637..549f618c6cbc9 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -499,12 +499,19 @@ def parallelize( break if is_pipeline: with paddle.static.program_guard(dist_main_prog): - paddle.distributed.barrier() + paddle.distributed.barrier(get_process_group(0)) # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if len(_g_process_group_map) > 0: + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce( + tmp, sync_op=True, group=_g_process_group_map[0] + ) + paddle.device.cuda.synchronize() + if rank not in process_group.ranks: continue process_group.instantiate() diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index a76a3f5dcb9ab..c4ef623b17260 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -177,10 +177,22 @@ def parallel(self, rank): time.time() - time0, self._mode ) ) + # Apply post optimization passes + time0 = time.time() + self._apply_post_optimization( + dist_main_prog, dist_startup_prog, rank, dist_params_grads + ) + self._logger.debug( + "within parallel apply_post_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) # Clone program for test if self._mode != 'train': + pipeline_opt = dist_main_prog._pipeline_opt dist_main_prog = dist_main_prog.clone(for_test=True) dist_startup_prog = dist_startup_prog.clone(for_test=True) + dist_main_prog._pipeline_opt = pipeline_opt # Store the distributed programs for further usages self._dist_context.dist_main_programs[rank] = dist_main_prog @@ -247,7 +259,7 @@ def _apply_pre_optimization( # apply quantization pass # The pass can be applied when mode must be 'train' - if self._strategy.qat.enable: + if self._mode == 'train' and self._strategy.qat.enable: config = copy.deepcopy(self._strategy.qat.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads @@ -307,8 +319,8 @@ def _apply_post_optimization( ) params_grads = self._pass_context.get_attr("params_grads") - # GradClip is train-only optimization if self._mode == "train": + # GradClip is train-only optimization config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads @@ -330,6 +342,13 @@ def _apply_post_optimization( 
[main_program], [startup_program], self._pass_context ) + if self._strategy.pipeline.enable: + self._strategy.gradient_merge.enable = True + self._strategy.gradient_merge.k_steps = ( + self._strategy.pipeline.accumulate_steps + ) + self._strategy.gradient_merge.avg = True + # gradient_merge is then train-only optimization if self._mode == "train" and self._strategy.gradient_merge.enable: config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) @@ -342,6 +361,16 @@ def _apply_post_optimization( [main_program], [startup_program], self._pass_context ) + if self._strategy.pipeline.enable: + config = copy.deepcopy(self._strategy.pipeline.to_dict()) + config["dist_context"] = self._dist_context + auto_parallel_pipeline_pass = new_pass( + "auto_parallel_pipeline", config + ) + auto_parallel_pipeline_pass.apply( + [main_program], [startup_program], self._pass_context + ) + if self._mode == "train" and self._strategy.fused_passes.enable: if len(self._strategy.fused_passes.fused_passes_list) > 0: new_pass_list = [] diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 83e1642ba21bb..8c300cbcd53b6 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -52,9 +52,9 @@ def new_process_group(ranks, group_id=None, force_new_group=False): global _g_process_group_map if not force_new_group: # A key constructed from ranks is used for avoiding duplication - new_key = ''.join(map(str, sorted(ranks))) + new_key = ''.join(map(str, ranks)) for pg_id, pg in _g_process_group_map.items(): - cur_key = ''.join(map(str, sorted(pg.ranks))) + cur_key = ''.join(map(str, pg.ranks)) if pg_id != 0 and new_key == cur_key: return pg # If not matching the existing one, construct a new process group @@ -82,7 +82,7 @@ def __init__(self, group_id, ranks): group_id != 0 ), "Process group id 0 is reserved for all ranks." 
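A note on the process_group.py hunk above: dropping sorted() makes both the dedup key in new_process_group and ProcessGroup._ranks order-sensitive. A small plain-Python sketch mirroring the key construction in this diff:

def group_key(ranks):
    # After this change the key preserves caller order (previously: sorted(ranks)).
    return ''.join(map(str, ranks))

print(group_key([0, 1]))   # "01"
print(group_key([1, 0]))   # "10"
# [1, 0] no longer matches an existing [0, 1] group, so a new group is created.
print(group_key([0, 1]) == group_key([1, 0]))  # False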
self._group_id = group_id - self._ranks = sorted(ranks) + self._ranks = ranks # Add the current ranks into group 0 if group_id != 0: global _g_process_group_map @@ -109,7 +109,7 @@ def add_ranks(self, new_ranks): not self.is_instantiate() ), "Cannot add new ranks after instantiating the process group" self._ranks.extend(new_ranks) - self._ranks = sorted(set(self.ranks)) + self._ranks = list(set(self.ranks)) def local_rank(self, global_rank): if global_rank in self.ranks: diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 7461e85c67248..91e07fc651d20 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -848,7 +848,8 @@ def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - block._remove_op(idx) + block._remove_op(idx, sync=False) + block._sync_with_cpp() @staticmethod def remove_no_need_vars( @@ -1000,7 +1001,8 @@ def remove_no_need_in_startup( if is_no_need_op: remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - startup_block._remove_op(idx) + startup_block._remove_op(idx, sync=False) + startup_block._sync_with_cpp() class Resharder: @@ -1441,6 +1443,8 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): target_process_group = target_process_mesh.process_ids target_process_shape = target_process_mesh.shape + op_role = dist_attr[2] + if source_tensor.shape[0] < 0: assert source_tensor.shape[0] == -1 new_shape = list(source_tensor.shape) @@ -1583,6 +1587,10 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): Resharder.concat_partitions( partition_index_list, source_partition_index ) + if int(op_role) == int(OpRole.Forward): + self.dist_context.up_down_streams.add_pair_stream( + to_send_process, target_process + ) # append concat op desc op_desc_seq[target_process].append( @@ -2037,13 +2045,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # if ( - # old_name - # in op_dist_attr._inputs_dist_attrs - # ): - # op_dist_attr.del_input_dist_attr( - # old_name - # ) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2067,7 +2068,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # op_dist_attr.del_input_dist_attr(old_name) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2095,7 +2095,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # op_dist_attr.del_input_dist_attr(old_name) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2135,7 +2134,13 @@ def _get_subblock_input_attrs(self, op, var_name): has_exist = True break if not has_exist: - input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [ + process_mesh, + input_dims_mapping, + op.attr('op_role'), + ] + ) return input_attrs def _get_subblock_output_attrs(self, op, var_name): @@ -2165,7 +2170,13 @@ def _get_subblock_output_attrs(self, op, var_name): has_exist = True break if not has_exist: - output_attrs.append([process_mesh, output_dims_mapping]) + output_attrs.append( + [ + process_mesh, + output_dims_mapping, + op.attr('op_role'), + ] + ) return output_attrs def _get_common_op_input_attrs(self, op, var_name): @@ -2188,7 +2199,9 @@ def _get_common_op_input_attrs(self, op, var_name): input_dims_mapping = dist_attr.get_input_dims_mapping(var_name) input_attrs = [] for process_mesh in process_meshes: - 
input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [process_mesh, input_dims_mapping, op.attr('op_role')] + ) return input_attrs @@ -2207,7 +2220,7 @@ def get_op_input_attrs(self, op, var_name): assert ( op_input_attrs - ), "The input '{}' of op '{}' has no distibution attributes in subblock".format( + ), "The input '{}' of op '{}' has no distributed attributes in subblock".format( op.name, var_name ) @@ -2215,30 +2228,24 @@ def get_op_input_attrs(self, op, var_name): def _remove_global_process_mesh(self): """Remove global process mesh from dist_context.process_meshes""" - processes = set() + process_ids = set() process_mesh_count = len(self.dist_context.process_meshes) if process_mesh_count > 1: - global_process_mesh_idx = None + global_process_mesh_idx = [] + has_sub_process_mesh = False for process_mesh in self.dist_context.process_meshes: - for process in process_mesh.process_ids: - processes.add(process) + for process_id in process_mesh.process_ids: + process_ids.add(process_id) for idx, process_mesh in enumerate( self.dist_context.process_meshes ): - if len(set(process_mesh.process_ids)) == len(processes): - global_process_mesh_idx = idx - break + if len(set(process_mesh.process_ids)) == len(process_ids): + global_process_mesh_idx.append(idx) + elif set(process_mesh.process_ids) < process_ids: + has_sub_process_mesh = True - if global_process_mesh_idx is not None: - is_removed = False - global_mesh = self.dist_context.process_meshes[idx] - for i, mesh in enumerate(self.dist_context.process_meshes): - if i == idx: - continue - if set(mesh.process_ids) < set(global_mesh.process_ids): - is_removed = True - - if is_removed: + if has_sub_process_mesh: + for idx in reversed(global_process_mesh_idx): self.dist_context.process_meshes.pop(idx) def _change_subblock_op_input_and_output(self, block_idx, block): @@ -2278,7 +2285,6 @@ def _change_subblock_op_input_and_output(self, block_idx, block): op_dist_attr.set_input_dist_attr( new_name, op_input_dist_attr ) - # op_dist_attr.del_input_dist_attr(old_name) # the outputs also need to be renamed when the output name is the same with input name in inplace op for var_name in op.output_arg_names: @@ -2302,7 +2308,6 @@ def _change_subblock_op_input_and_output(self, block_idx, block): op_dist_attr.set_output_dist_attr( new_name, op_output_dist_attr ) - # op_dist_attr.del_output_dist_attr(old_name) def _reshard_input(self, block): idx = 0 @@ -2450,7 +2455,7 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): assert set_lod is True # cast int64 to bool - block._insert_op( + cast_op = block._insert_op( idx + 2, type='cast', inputs={ @@ -2465,6 +2470,7 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): 'op_role': op.attr('op_role'), }, ) + cast_op._set_attr('op_namescope', "/auto_parallel/reshard") else: if var.lod_level != 0: recv_out = block.create_var( @@ -2612,6 +2618,10 @@ def _reshard_output(self, block): ] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( @@ -2640,6 +2650,10 @@ def _reshard_output(self, block): item = output_attr[0].process_ids[index] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( @@ -2714,7 +2728,11 @@ def 
get_cost(self, op, tensor, cluster): tensor.name ) process_mesh = dist_op.dist_attr.process_mesh - dist_attr = [process_mesh, dims_mapping] + dist_attr = [ + process_mesh, + dims_mapping, + dist_op.serial_op.attr('op_role'), + ] if dist_tensor is not None and self.need_reshard( dist_tensor, dist_attr ): diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py index 58a08586ff5cb..a4dd2c54d2eed 100644 --- a/python/paddle/distributed/auto_parallel/strategy.py +++ b/python/paddle/distributed/auto_parallel/strategy.py @@ -102,6 +102,12 @@ def __init__(self, config_dict=None): super().__init__(category, config_dict) +class PipelineConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.PIPELINE + super().__init__(category, config_dict) + + class QATConfig(BaseConfig): def __init__(self, config_dict=None): category = constants.QAT @@ -186,6 +192,9 @@ def __init__(self, config=None): config_dict = self._config_dict.get(constants.GRADIENT_MERGE, None) self.gradient_merge = GradientMergeConfig(config_dict) + config_dict = self._config_dict.get(constants.PIPELINE, None) + self.pipeline = PipelineConfig(config_dict) + config_dict = self._config_dict.get(constants.QAT, None) self.qat = QATConfig(config_dict) diff --git a/python/paddle/distributed/auto_parallel/topology.py b/python/paddle/distributed/auto_parallel/topology.py new file mode 100644 index 0000000000000..9de045bd612a3 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/topology.py @@ -0,0 +1,351 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
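The new topology.py module below defines a SingleNodeTopology helper. A hedged usage sketch (detect() and dump() are the methods added in this file; running them assumes lscpu and nvidia-smi are available on the node):

from paddle.distributed.auto_parallel.topology import SingleNodeTopology

topo = SingleNodeTopology()
topo.detect()                # probes CPU flops, GPU models/memory and PCIe/NVLink links, prints the JSON
topo.dump("topology.json")   # writes the collected machine description to a JSON file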
+ +import json +import subprocess +import warnings + + +def call_cmd(cmd, err_msg, default_value): + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + shell=True, + ) + stdout, stderr = process.communicate() + if stderr: + warnings.warn(err_msg) + stdout = default_value + + return stdout + + +class SingleNodeTopology: + def __init__(self): + self.pcie_latency = 0.0 + self.pcie_bandwidth = float('inf') + self.nvlink_bandwidth = -1.0 + self.nb_devices = 8 + + self.machine = {} + self.devices = [] + self.links = [] + self.json_object = None + + def calculate_cpu_flops(self): + # Get number sockets + cmd = "lscpu | grep 'Socket(s)' | awk '{print $NF}'" + err_msg = "Failed to get number of sockets" + default_value = 4 + nb_sockets = call_cmd(cmd, err_msg, default_value) + + # Get number of cores per socket + cmd = "lscpu | grep 'Core(s) per socket' | awk '{print $NF}'" + err_msg = "Failed to get number of cores per socket" + default_value = 20 + nb_cores_per_socket = call_cmd(cmd, err_msg, default_value) + + # Get clock speed + cmd = "lscpu | grep GHz | awk -F '@' '{print $NF}' | awk -F 'G' '{print $1}'" + err_msg = "Failed to get cpu clock rate" + default_value = 2.4 + clock_rate = call_cmd(cmd, err_msg, default_value) + + # Get number of FMA units + # TODO(changtao02): find a way to detect this value + nb_fmas = 2 + + # Get SIMD width + simd_width_sp = 0 + simd_width_dp = 0 + + cmd = "lscpu | grep sse" + err_msg = "Failed to get cpu vector size" + default_value = "sse" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 4 # 128 / 32 + simd_width_dp = 2 # 128 / 64 + + cmd = "lscpu | grep avx2" + err_msg = "Failed to get cpu vector size" + default_value = "avx2" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 8 # 256 / 32 + simd_width_dp = 4 # 256 / 64 + + cmd = "lscpu | grep avx512" + err_msg = "Failed to get cpu vector size" + default_value = "avx512" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 16 # 512 / 32 + simd_width_dp = 8 # 512 / 64 + + gflops_per_element = ( + int(nb_sockets) + * int(nb_cores_per_socket) + * float(clock_rate) + * nb_fmas + ) + sp_gflops = gflops_per_element * simd_width_sp + dp_gflops = gflops_per_element * simd_width_dp + + self.machine['sp_gflops'] = sp_gflops + self.machine['dp_gflops'] = dp_gflops + + def pcie_gen2bandwidth(self, pcie_generation): + if pcie_generation == 1: + return 0.25 + elif pcie_generation == 2: + return 0.5 + elif pcie_generation == 3: + return 1.0 + elif pcie_generation == 4: + return 2.0 + elif pcie_generation == 5: + return 4.0 + elif pcie_generation == 6: + return 8.0 + + def model2gflops(self, model): + if "H100" in model and "SXM5" in model: + return 60000, 30000 + elif "H100" in model and "PCIe" in model: + return 48000, 24000 + elif "A100" in model: + return 19500, 9700 + elif "V100" in model: + return 15700, 7800 + elif "P100" in model: + return 10600, 5300 + + def get_link_bandwidth(self, source_id, target_id): + # Get link type + row_id = 2 + source_id + column_id = 2 + target_id + + cmd = ( + "cat matrix.txt | awk 'FNR==" + + str(row_id) + + " {print $" + + str(column_id) + + "}'" + ) + err_msg = "Failed to get topo matrix" + default_value = "NVL" + link_type = call_cmd(cmd, err_msg, default_value) + + link_bandwidth = self.pcie_bandwidth + + if "NV" in link_type: + if self.nvlink_bandwidth == -1.0: + cmd = "nvidia-smi nvlink -s -i 0 | 
tail -n 1 | awk '{print $3}'" + err_msg = "Failed to get nvlink bandwidth" + default_value = "25" + self.nvlink_bandwidth = float( + call_cmd(cmd, err_msg, default_value) + ) + + link_bandwidth = int(link_type[2:]) * self.nvlink_bandwidth + link_type = "NVL" + + return link_type, link_bandwidth + + def get_host_info(self): + # Get hostname + cmd = "hostname -s" + err_msg = "Failed to get hostname" + default_value = "localhost" + hostname = call_cmd(cmd, err_msg, default_value).strip() + + # Get ip address + cmd = "hostname -i" + err_msg = "Failed to get host ip address" + default_value = "127.0.0.1" + ip_addr = call_cmd(cmd, err_msg, default_value).strip() + + # Get CPU memory (GB) + cmd = "cat /proc/meminfo | grep 'MemAvailable' | awk -F ':' '{print $NF}' | awk '{print $1}'" + err_msg = "Failed to get cpu memory" + default_value = "41366484" + cpu_memory = int(call_cmd(cmd, err_msg, default_value)) // 1e6 + + # Get single-point flops and double-point flops (GFLOPs) + self.calculate_cpu_flops() + + self.machine['hostname'] = hostname + self.machine['addr'] = ip_addr + self.machine['memory'] = cpu_memory + + def get_device_info(self): + # Get device count + cmd = "nvidia-smi -L | wc -l" + err_msg = "Failed to get device count" + default_value = "8" + self.nb_devices = int(call_cmd(cmd, err_msg, default_value)) + + # Get PCIe latency and bandwidth (ms, GB/s) + for i in range(self.nb_devices): + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=pcie.link.gen.max --format=csv,noheader" + ) + err_msg = "Failed to get max pcie link generation" + default_value = "4" + pcie_generation = int(call_cmd(cmd, err_msg, default_value)) + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=pcie.link.width.max --format=csv,noheader" + ) + err_msg = "Failed to get max pcie link width" + default_value = "16" + pcie_width = int(call_cmd(cmd, err_msg, default_value)) + + self.pcie_bandwidth = min( + self.pcie_bandwidth, + self.pcie_gen2bandwidth(pcie_generation) * pcie_width, + ) + + dev_global_ids = [] + dev_local_ids = [] + dev_types = [] + dev_models = [] + dev_memories = [] # GiB + dev_sp_gflops = [] # GB/s + dev_dp_gflops = [] # GB/s + + # Get device info + for i in range(self.nb_devices): + dev_global_ids.append(i) + dev_local_ids.append(i) + dev_types.append("GPU") + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=name --format=csv,noheader" + ) + err_msg = "Failed to get device name" + default_value = "NVIDIA A100-SXM4-40GB" + dev_models.append(call_cmd(cmd, err_msg, default_value).strip()) + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=memory.free --format=csv,noheader | awk '{print $1}'" + ) + err_msg = "Failed to get device available memory" + default_value = "40536" + dev_memories.append( + int(call_cmd(cmd, err_msg, default_value)) // 1e3 + ) + + sp_gflops, dp_gflops = self.model2gflops(dev_models[i]) + dev_sp_gflops.append(sp_gflops) + dev_dp_gflops.append(dp_gflops) + + for i in range(len(dev_global_ids)): + device = {} + device['global_id'] = dev_global_ids[i] + device['local_id'] = dev_local_ids[i] + device['type'] = dev_types[i] + device['model'] = dev_models[i] + device['memory'] = dev_memories[i] + device['sp_gflops'] = dev_sp_gflops[i] + device['dp_gflops'] = dev_dp_gflops[i] + self.devices.append(device) + + self.machine['latency'] = self.pcie_latency + self.machine['bandwidth'] = self.pcie_bandwidth + self.machine['devices'] = self.devices + + def get_link_info(self): + link_source_global_ids = [] + link_target_global_ids = [] + link_types = [] 
+ link_latencies = [] # ms + link_bandwidths = [] # GB/s + + cmd = "nvidia-smi topo -m > matrix.txt" + err_msg = "Failed to get topo matrix" + default_value = "" + call_cmd(cmd, err_msg, default_value) + + # Get link info between devices + for i in range(self.nb_devices): + for j in range(self.nb_devices): + if i == j: + link_types.append("X") + link_bandwidths.append(-1.0) + else: + link_source_global_ids.append(i) + link_target_global_ids.append(j) + link_latencies.append(0.0) + if i > j: + index = j * self.nb_devices + i + link_types.append(link_types[index]) + link_bandwidths.append(link_bandwidths[index]) + elif i < j: + link_type, link_bandwidth = self.get_link_bandwidth( + i, j + ) + link_types.append(link_type) + link_bandwidths.append(link_bandwidth) + + for i in reversed(range(self.nb_devices)): + link_types.pop(i * self.nb_devices + i) + link_bandwidths.pop(i * self.nb_devices + i) + + cmd = "rm matrix.txt" + err_msg = "Failed to delete matrix.txt" + default_value = "" + call_cmd(cmd, err_msg, default_value) + + for i in range(len(link_types)): + link = {} + link['source_global_id'] = link_source_global_ids[i] + link['target_global_id'] = link_target_global_ids[i] + link['type'] = link_types[i] + link['latency'] = link_latencies[i] + link['bandwidth'] = link_bandwidths[i] + self.links.append(link) + + self.machine['links'] = self.links + + def detect(self): + # Get host info + self.get_host_info() + + # Get device info + self.get_device_info() + + # Get link info between devices + self.get_link_info() + + self.json_object = json.dumps(self.machine, indent=4) + print(self.json_object) + + def dump(self, output_path): + with open(output_path, "w") as outfile: + json.dump(self.machine, outfile, indent=4) diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/tuner/profiler.py index cca53773ebbef..27e0fa4984544 100644 --- a/python/paddle/distributed/auto_parallel/tuner/profiler.py +++ b/python/paddle/distributed/auto_parallel/tuner/profiler.py @@ -91,7 +91,7 @@ def init_process_groups(group_map, rank): # TODO should instantiate global group first all_process_groups = get_all_process_groups() for process_group in all_process_groups: - if rank not in process_group.ranks: + if process_group.id == 0 or rank not in process_group.ranks: continue print(process_group) process_group.instantiate() diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 3b870afe6f5c1..6b38bffc0bf3f 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -122,9 +122,9 @@ def all_reduce( tensor, op, group, sync_op, use_calc_stream ) else: - assert ( - group is None - ), "Group can not be used in static graph mode for now." + # assert ( + # group is None + # ), "Group can not be used in static graph mode for now." 
return _all_reduce_in_static_mode( tensor, op, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 194e4bd667555..86292a2d90e79 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -24,12 +24,6 @@ from paddle.fluid.framework import _global_flags from paddle.fluid.wrapped_decorator import wrap_decorator -protobuf_version = google.protobuf.__version__ -if protobuf_version >= "4.21.0": - from google._upb import _message -else: - from google.protobuf.pyext import _message - __all__ = [] non_auto_func_called = True @@ -1702,6 +1696,12 @@ def hybrid_configs(self, configs): check_configs_key( self.strategy.hybrid_configs, hybrid_config, "hybrid_configs" ) + + if "mp_configs" in configs: + assign_configs_value( + self.strategy.hybrid_configs.mp_configs, configs["mp_configs"] + ) + configs.pop("mp_configs") assign_configs_value(self.strategy.hybrid_configs, configs) @property @@ -2512,10 +2512,19 @@ def __repr__(self): self.strategy, f.name + "_configs" ) config_fields = my_configs.DESCRIPTOR.fields + protobuf_version = google.protobuf.__version__ + if protobuf_version >= "4.21.0": + RepeatedScalarContainer = ( + google._upb._message.RepeatedScalarContainer + ) + else: + RepeatedScalarContainer = ( + google.protobuf.pyext._message.RepeatedScalarContainer + ) for ff in config_fields: if isinstance( getattr(my_configs, ff.name), - _message.RepeatedScalarContainer, + RepeatedScalarContainer, ): values = getattr(my_configs, ff.name) for i, v in enumerate(values): diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index 08093710b3b89..884af3a441431 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -46,7 +46,15 @@ def _c_identity(tensor, group=None): class c_identity_eager(PyLayer): @staticmethod def forward(ctx, tensor): - return tensor + return _legacy_C_ops.c_identity( + tensor, + 'use_calc_stream', + True, + 'ring_id', + group.id, + 'use_model_parallel', + True, + ) @staticmethod def backward(ctx, dy): @@ -249,7 +257,15 @@ def forward( @staticmethod def backward(ctx, dy): - return dy + return _legacy_C_ops.c_identity( + dy, + 'use_calc_stream', + True, + 'ring_id', + ctx.ring_id, + 'use_model_parallel', + True, + ) return mp_allreduce_eager.apply( tensor, group, use_calc_stream, use_model_parallel diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index c9474d397417a..15bd883e970be 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import paddle from paddle.framework import core @@ -196,31 +195,6 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward, }, ) - elif core.is_compiled_with_custom_device('npu'): - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': comm_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': comm_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - OP_ROLE_KEY: OpRole.Forward, - }, - ) else: raise ValueError( "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 98604b8db3d8c..acd34f1b1d5b8 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. + import paddle from paddle import framework from paddle.autograd import no_grad +from paddle.distributed import fleet from paddle.framework import core from paddle.nn import ClipGradByGlobalNorm, clip @@ -292,6 +294,83 @@ def __init__(self, optimizer, hcg, strategy): self._inner_opt._grad_clip, hcg ) + def _filter_fn(self, param): + p_name = param.name + tar_param = ["embedding", "layer_norm", ".b_"] + if param.is_distributed is False: + for tar in tar_param: + if tar in p_name: + return True + return False + + def _step(self, parameters_list): + mp_group = self._hcg.get_model_parallel_group() + src_rank = self._hcg.get_model_parallel_group_src_rank() + params = None + mp_configs = None + + if mp_group.nranks > 1: + mp_configs = fleet.fleet._user_defined_strategy.hybrid_configs[ + "mp_configs" + ] + + if mp_configs and ( + mp_configs.sync_param + or mp_configs.sync_grad + or mp_configs.sync_moment + ): + params = sorted( + [p for p in parameters_list if self._filter_fn(p)], + key=lambda p: p.name, + ) + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_grad: + for p in params: + if p.grad is None: + continue + paddle.distributed.broadcast( + p.grad, src=src_rank, group=mp_group, sync_op=True + ) + + self._inner_opt.step() + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_param: + for p in params: + paddle.distributed.broadcast( + p, src=src_rank, group=mp_group, sync_op=True + ) + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_moment: + for p in params: + # support opt state of adam and adamw to broadcast now. 
+ if isinstance( + self._inner_opt, + (paddle.optimizer.Adam, paddle.optimizer.AdamW), + ): + if ( + self._inner_opt._multi_precision + and p.name in self._master_weights + ): + paddle.distributed.broadcast( + self._inner_opt._master_weights[p.name], + src=src_rank, + group=mp_group, + sync_op=True, + ) + + moment1 = self._inner_opt._get_accumulator( + self._inner_opt._moment1_acc_str, p + ) + moment2 = self._inner_opt._get_accumulator( + self._inner_opt._moment2_acc_str, p + ) + paddle.distributed.broadcast( + moment1, src=src_rank, group=mp_group, sync_op=True + ) + paddle.distributed.broadcast( + moment2, src=src_rank, group=mp_group, sync_op=True + ) + @no_grad() @framework.dygraph_only def step(self): @@ -302,7 +381,7 @@ def step(self): if self._dp_enable: fused_allreduce_gradients(list(parameters_list), self._hcg) - self._inner_opt.step() + self._step(parameters_list) @no_grad() def minimize( diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 24df3203183f5..3776583371526 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -24,7 +24,6 @@ Variable, default_main_program, default_startup_program, - save_inference_model, ) from ..base.private_helper_function import wait_server_ready @@ -735,7 +734,7 @@ def _ps_inference_save_inference_model( raise TypeError( "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed" ) - save_inference_model( + paddle.fluid.io.save_inference_model( dirname, feeded_var_names, target_vars, @@ -746,7 +745,7 @@ def _ps_inference_save_inference_model( export_for_deployment, ) else: - save_inference_model( + paddle.fluid.io.save_inference_model( dirname, feeded_var_names, target_vars, diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 8550cb049b11e..8ab110e60c3b9 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -23,6 +23,7 @@ from .auto_parallel_data_parallel_optimization import * # noqa: F403 from .auto_parallel_grad_clip import * # noqa: F403 from .auto_parallel_supplement_explicit_dependencies import * # noqa: F403 +from .auto_parallel_pipeline import * # noqa: F403 from .cpp_pass import * # noqa: F403 from .ps_trainer_pass import * # noqa: F403 from .ps_server_pass import * # noqa: F403 diff --git a/python/paddle/distributed/passes/auto_parallel_pipeline.py b/python/paddle/distributed/passes/auto_parallel_pipeline.py new file mode 100644 index 0000000000000..5b707d088bf8b --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_pipeline.py @@ -0,0 +1,626 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
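The _step logic above is switched on by the new "mp_configs" entry of hybrid_configs (handled in the distributed_strategy.py hunk earlier). A hedged configuration sketch: the sync_param/sync_grad/sync_moment field names are taken from this diff, while the dp/mp/pp degrees are the usual hybrid_configs keys:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": 1,
    "mp_degree": 2,
    "pp_degree": 1,
    # New in this diff: optional broadcast of params/grads/optimizer moments
    # from the tensor-parallel group's source rank around each optimizer step.
    "mp_configs": {"sync_param": True, "sync_grad": False, "sync_moment": False},
}
fleet.init(is_collective=True, strategy=strategy)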
+ +import os + +from paddle.distributed.fleet.fleet_executor_utils import TaskNode +from paddle.fluid import core +from paddle.fluid.framework import Parameter, Program + +from .pass_base import PassBase, register_pass + +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, + core.VarDesc.VarType.STEP_SCOPES, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + core.VarDesc.VarType.FEED_MINIBATCH, + core.VarDesc.VarType.FETCH_LIST, +] + + +@register_pass("auto_parallel_pipeline") +class PipelinePass(PassBase): + def __init__(self): + super().__init__() + self.set_attr("dist_context", None) + + def _check_self(self): + if self.get_attr("dist_context") is None: + return False + return True + + def _check_conflict(self, other_pass): + return True + + def _apply_single_impl(self, main_program, startup_program, context): + self._dist_context = self.get_attr("dist_context") + self._acc_steps = self.get_attr("accumulate_steps") + self._mode = self.get_attr("schedule_mode") + self._gen_bsz = self.get_attr("generation_batch_size") + self._program = main_program + + if self._mode == "1F1B": + raise NotImplementedError("1F1B has not been implemented") + elif self._mode == "F-Then-B": + raise NotImplementedError("F-Then-B has not been implemented") + elif self._mode == "stream": + self._insert_sync_ops_for_stream() + self._task_stream() + else: + raise ValueError( + "Now only 'F-then-B', '1F1B' and 'stream' are supported." + "The given value is {}.".format(self._mode) + ) + + def _insert_sync_ops_for_stream(self): + + for block in self._program.blocks: + offset = 0 + send_vars = [] + # insert sync ops + for index, op in enumerate(list(block.ops)): + if op.type == 'send_v2': + # step1: set 'use_calc_stream' False + op._set_attr("use_calc_stream", False) + op_role = op.attr('op_role') + # step2: insert 'c_sync_calc_stream' op before 'send_v2' op + var_name = op.input_arg_names[0] + var = block.var(var_name) + block._insert_op_without_sync( + index=index + offset, + type="c_sync_calc_stream", + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={'op_role': op_role}, + ) + offset += 1 + send_vars.append(var_name) + + for var_name in send_vars: + nop_op = block.append_op(type='nop') + nop_op.desc.set_input('X', [var_name]) + nop_op.desc.set_output('Out', [var_name]) + + block._sync_with_cpp() + + def _create_param(self, dst_block, src_var): + copied_kwargs = {} + copied_kwargs['trainable'] = src_var.trainable + copied_kwargs['optimize_attr'] = src_var.optimize_attr + copied_kwargs['regularizer'] = src_var.regularizer + copied_kwargs['do_model_average'] = src_var.do_model_average + copied_kwargs['need_clip'] = src_var.need_clip + + Parameter( + block=dst_block, + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + **copied_kwargs + ) + + def _create_inter(self, dst_block, src_var): + dst_block.create_var( + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + + def _create_var( + self, src_block, dst_block, src_varname, force_create=False + ): + + if not force_create: + src_var = src_block.var(src_varname) + else: + src_var = 
src_block._var_recursive(src_varname) + if src_var.type in __not_shape_var_type__: + persist = getattr(src_var, 'persistable', False) + dst_block.create_var( + type=src_var.type, + name=src_var.name, + persistable=persist, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + else: + if isinstance(src_var, Parameter): + self._create_param(dst_block, src_var) + else: + self._create_inter(dst_block, src_var) + + def _create_program(self, src_block, dst_block, src_op, force_create=False): + dst_op_desc = dst_block.desc.append_op() + dst_op_desc.copy_from(src_op.desc) + for input_varname in src_op.input_arg_names: + if src_block.has_var(input_varname) or ( + force_create and src_block._find_var_recursive(input_varname) + ): + self._create_var( + src_block, dst_block, input_varname, force_create + ) + for output_varname in src_op.output_arg_names: + if src_block.has_var(output_varname) or ( + force_create and src_block._find_var_recursive(output_varname) + ): + self._create_var( + src_block, dst_block, output_varname, force_create + ) + + def _get_pp_stage(self, rank): + pp_idx = None + for idx, process_mesh in enumerate(self._dist_context.process_meshes): + if rank in process_mesh.processes: + pp_idx = idx + break + return pp_idx + + def _task_stream(self): + cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(',') + nrank = len(trainer_endpoints) + num_of_functionality = 5 + + # compute current pp stage + pp_stages = len(self._dist_context.process_meshes) + cur_pp_stage = self._get_pp_stage(cur_rank) + + start_prog = Program() + cond_prog = Program() + end_prog = Program() + send_prog = Program() + recv_prog = Program() + + cond_var_name = None + send_vars_name = set() + recv_vars_name = {} + for ib, src_block in enumerate(self._program.blocks): + if ib == 0: + strat_block = start_prog.block(0) + end_block = end_prog.block(0) + + is_after_while_op = False + for op in src_block.ops: + if op.type == "while": + assert len(op.input('Condition')) == 1 + cond_var_name = op.input('Condition')[0] + is_after_while_op = True + continue + + if not is_after_while_op: + self._create_program( + src_block, strat_block, op, force_create=True + ) + else: + self._create_program( + src_block, end_block, op, force_create=True + ) + elif ib == 1: + send_block = send_prog.block(0) + recv_block = recv_prog.block(0) + + is_after_send_op = False + is_after_recv_op = False + for op in src_block.ops: + if op.type == "send_v2" and not is_after_send_op: + is_after_send_op = True + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + continue + + if ( + is_after_send_op + and not is_after_recv_op + and op.type == "recv_v2" + ): + is_after_recv_op = True + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + 
old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + self._create_program( + src_block, recv_block, op, force_create=True + ) + continue + + if not is_after_send_op or not is_after_recv_op: + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + + if is_after_send_op and is_after_recv_op: + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + for in_name in op.desc.input_arg_names(): + if in_name in recv_vars_name: + op.desc._rename_input( + in_name, recv_vars_name[in_name] + ) + self._create_program( + src_block, recv_block, op, force_create=True + ) + else: + raise Exception("Only support generation condition.") + + start_prog._sync_with_cpp() + end_prog._sync_with_cpp() + send_prog._sync_with_cpp() + recv_prog._sync_with_cpp() + + assert cond_var_name is not None + + send_task_node_var_dtype = {} + send_task_node_var_shape = {} + recv_task_node_var_dtype = {} + recv_task_node_var_shape = {} + for var_name in list(send_vars_name): + var = send_prog.global_block().vars[var_name] + dtype = str(var.dtype) + send_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + send_task_node_var_shape[var_name] = var.shape + for var_name in list(set(recv_vars_name.values())): + var = recv_prog.global_block().vars[var_name] + dtype = str(var.dtype) + recv_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + recv_task_node_var_shape[var_name] = var.shape + + vars_to_dtype = [] + vars_to_shape = [] + if len(send_task_node_var_dtype) > 0: + assert len(recv_task_node_var_dtype) == 0 + vars_to_dtype = send_task_node_var_dtype + vars_to_shape = send_task_node_var_shape + if len(recv_task_node_var_dtype) > 0: + assert len(send_task_node_var_dtype) == 0 + vars_to_dtype = recv_task_node_var_dtype + vars_to_shape = recv_task_node_var_shape + + start_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Start", + 
task_id=int(cur_rank * num_of_functionality + 0), + program=start_prog, + lazy_initialize=True, + ) + cond_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Cond", + task_id=int(cur_rank * num_of_functionality + 1), + program=cond_prog, + cond_var_name=cond_var_name, + lazy_initialize=True, + ) + send_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 2), + program=send_prog, + lazy_initialize=True, + ) + recv_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 3), + program=recv_prog, + lazy_initialize=True, + vars_to_dtype=vars_to_dtype, + vars_to_shape=vars_to_shape, + ) + end_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 4), + program=end_prog, + lazy_initialize=True, + ) + + # add dependencies for task nodes intra stage + inf = -1 + pp_buff_size = int(pp_stages - cur_pp_stage) + start_task_node.add_downstream_task( + cond_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + start_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_upstream_task( + start_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + cond_task_node.task_id(), + "'s upstream is:", + start_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_downstream_task(send_task_node.task_id(), inf) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + send_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_upstream_task(cond_task_node.task_id(), inf) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_downstream_task( + recv_task_node.task_id(), pp_buff_size + ) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + recv_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + recv_task_node.add_upstream_task(send_task_node.task_id(), pp_buff_size) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + send_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + recv_task_node.add_downstream_task( + cond_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_upstream_task( + recv_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s upstream is:", + recv_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_downstream_task( + end_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + end_task_node.task_id(), + ", buffer size is:", + inf, + ) + end_task_node.add_upstream_task( + cond_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + end_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + + # add dependencies for task nodes inter stage + # get upstream ranks and downstream ranks of cur_rank + up_down_streams = self._dist_context.up_down_streams + pp_upstream_ranks = up_down_streams.ups(cur_rank) + pp_downstream_ranks = 
up_down_streams.downs(cur_rank) + + for upstream_rank in pp_upstream_ranks: + upstream_pp_stage = self._get_pp_stage(upstream_rank) + if upstream_pp_stage < pp_stages - 1: + upstream_task_id = int(upstream_rank * num_of_functionality + 2) + send_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + else: + upstream_task_id = int(upstream_rank * num_of_functionality + 3) + recv_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + for downstream_rank in pp_downstream_ranks: + if cur_pp_stage < pp_stages - 1: + downstream_task_id = int( + downstream_rank * num_of_functionality + 2 + ) + send_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + else: + downstream_task_id = int( + downstream_rank * num_of_functionality + 3 + ) + recv_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + + task_id_to_rank = {} + for i in range(nrank): + for j in range(num_of_functionality): + task_id_to_rank[int(i * num_of_functionality + j)] = i + self._program._pipeline_opt = { + "fleet_opt": { + 'tasks': [ + start_task_node, + cond_task_node, + send_task_node, + recv_task_node, + end_task_node, + ], + 'task_id_to_rank': task_id_to_rank, + 'num_micro_batches': self._acc_steps, + 'inference_generation': True, + } + } diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 77b83fa6a94c5..418ef478aaf13 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from paddle.distribution import transform +from paddle.distribution.bernoulli import Bernoulli from paddle.distribution.beta import Beta from paddle.distribution.categorical import Categorical from paddle.distribution.dirichlet import Dirichlet @@ -30,6 +31,7 @@ from paddle.distribution.laplace import Laplace __all__ = [ # noqa + 'Bernoulli', 'Beta', 'Categorical', 'Dirichlet', diff --git a/python/paddle/distribution/bernoulli.py b/python/paddle/distribution/bernoulli.py new file mode 100644 index 0000000000000..d6c6551b0c5ce --- /dev/null +++ b/python/paddle/distribution/bernoulli.py @@ -0,0 +1,485 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
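Before the Bernoulli additions below, one note on the pipeline pass that ends above: it is driven by the auto-parallel strategy's new pipeline section (see the strategy.py and parallelizer_v2.py hunks earlier). A hedged configuration sketch; the class name Strategy and the fields schedule_mode, accumulate_steps and generation_batch_size are assumed from the attributes the pass and the parallelizer read, with defaults defined in constants.PIPELINE (not shown in this diff):

from paddle.distributed.auto_parallel.strategy import Strategy

strategy = Strategy()
strategy.pipeline.enable = True
strategy.pipeline.schedule_mode = "stream"   # only "stream" is implemented by this pass
strategy.pipeline.accumulate_steps = 4       # parallelizer_v2 also forces gradient_merge with k_steps = 4
strategy.pipeline.generation_batch_size = 4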
+ + +import numpy as np + +import paddle +from paddle.distribution import exponential_family +from paddle.fluid.data_feeder import check_type, convert_dtype +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.layers import tensor +from paddle.nn.functional import ( + binary_cross_entropy_with_logits, + sigmoid, + softplus, +) + +# Smallest representable number +EPS = { + 'float32': paddle.finfo(paddle.float32).eps, + 'float64': paddle.finfo(paddle.float64).eps, +} + + +def _clip_probs(probs, dtype): + """Clip probs from [0, 1] to (0, 1) with ``eps``. + + Args: + probs (Tensor): probs of Bernoulli. + dtype (str): data type. + + Returns: + Tensor: Clipped probs. + """ + eps = EPS.get(dtype) + return paddle.clip(probs, min=eps, max=1 - eps).astype(dtype) + + +class Bernoulli(exponential_family.ExponentialFamily): + r"""Bernoulli distribution parameterized by ``probs``, which is the probability of value 1. + + In probability theory and statistics, the Bernoulli distribution, named after Swiss + mathematician Jacob Bernoulli, is the discrete probability distribution of a random + variable which takes the value 1 with probability ``p`` and the value 0 with + probability ``q=1-p``. + + The probability mass function of this distribution, over possible outcomes ``k``, is + + .. math:: + + {\begin{cases} + q=1-p & \text{if }value=0 \\ + p & \text{if }value=1 + \end{cases}} + + Args: + probs (float|Tensor): The ``probs`` input of Bernoulli distribution. The data type is float32 or float64. The range must be in [0, 1]. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + # init `probs` with a float + rv = Bernoulli(probs=0.3) + + print(rv.mean) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.30000001]) + + print(rv.variance) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.21000001]) + + print(rv.entropy()) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.61086434]) + """ + + def __init__(self, probs, name=None): + self.name = name or 'Bernoulli' + if not _non_static_mode(): + check_type( + probs, + 'probs', + (float, tensor.Variable), + self.name, + ) + + # Get/convert probs to tensor. + if self._validate_args(probs): + self.probs = probs + self.dtype = convert_dtype(probs.dtype) + else: + [self.probs] = self._to_tensor(probs) + self.dtype = paddle.get_default_dtype() + + # Check probs range [0, 1]. + if _non_static_mode(): + """Not use `paddle.any` in static mode, which always be `True`.""" + if ( + paddle.any(self.probs < 0) + or paddle.any(self.probs > 1) + or paddle.any(paddle.isnan(self.probs)) + ): + raise ValueError("The arg of `probs` must be in range [0, 1].") + + # Clip probs from [0, 1] to (0, 1) with smallest representable number `eps`. + self.probs = _clip_probs(self.probs, self.dtype) + self.logits = self._probs_to_logits(self.probs, is_binary=True) + + super().__init__(batch_shape=self.probs.shape, event_shape=()) + + @property + def mean(self): + """Mean of Bernoulli distribution. + + Returns: + Tensor: Mean value of distribution. + """ + return self.probs + + @property + def variance(self): + """Variance of Bernoulli distribution. + + Returns: + Tensor: Variance value of distribution. 
+ """ + return paddle.multiply(self.probs, (1 - self.probs)) + + def sample(self, shape): + """Sample from Bernoulli distribution. + + Args: + shape (Sequence[int]): Sample shape. + + Returns: + Tensor: Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(paddle.full((), 0.3)) + print(rv.sample([100]).shape) + # [100] + + rv = Bernoulli(paddle.to_tensor(0.3)) + print(rv.sample([100]).shape) + # [100, 1] + + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + print(rv.sample([100]).shape) + # [100, 2] + + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + print(rv.sample([100, 2]).shape) + # [100, 2, 2] + """ + name = self.name + '_sample' + if not _non_static_mode(): + check_type( + shape, + 'shape', + (np.ndarray, tensor.Variable, list, tuple), + name, + ) + + shape = shape if isinstance(shape, tuple) else tuple(shape) + shape = self._extend_shape(shape) + + with paddle.no_grad(): + return paddle.bernoulli(self.probs.expand(shape), name=name) + + def rsample(self, shape, temperature=1.0): + """Sample from Bernoulli distribution (reparameterized). + + The `rsample` is a continuously approximate of Bernoulli distribution reparameterized sample method. + [1] Chris J. Maddison, Andriy Mnih, and Yee Whye Teh. The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables. 2016. + [2] Eric Jang, Shixiang Gu, and Ben Poole. Categorical Reparameterization with Gumbel-Softmax. 2016. + + Note: + `rsample` need to be followed by a `sigmoid`, which converts samples' value to unit interval (0, 1). + + Args: + shape (Sequence[int]): Sample shape. + temperature (float): temperature for rsample, must be positive. + + Returns: + Tensor: Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + paddle.seed(2023) + + rv = Bernoulli(paddle.full((), 0.3)) + print(rv.sample([100]).shape) + # [100] + + rv = Bernoulli(0.3) + print(rv.rsample([100]).shape) + # [100, 1] + + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + print(rv.rsample([100]).shape) + # [100, 2] + + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + print(rv.rsample([100, 2]).shape) + # [100, 2, 2] + + # `rsample` has to be followed by a `sigmoid` + rv = Bernoulli(0.3) + rsample = rv.rsample([3, ]) + rsample_sigmoid = paddle.nn.functional.sigmoid(rsample) + print(rsample, rsample_sigmoid) + # Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-0.88315082], + # [-0.62347704], + # [-0.31513220]]) Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0.29252526], + # [0.34899110], + # [0.42186251]]) + + # The smaller the `temperature`, the distribution of `rsample` closer to `sample`, with `probs` of 0.3. 
+ print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=1.0)).sum()) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [361.06829834]) + + print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=0.1)).sum()) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [288.66418457]) + """ + name = self.name + '_rsample' + if not _non_static_mode(): + check_type( + shape, + 'shape', + (np.ndarray, tensor.Variable, list, tuple), + name, + ) + check_type( + temperature, + 'temperature', + (float,), + name, + ) + + shape = shape if isinstance(shape, tuple) else tuple(shape) + shape = self._extend_shape(shape) + + temperature = paddle.full( + shape=(), fill_value=temperature, dtype=self.dtype + ) + + probs = self.probs.expand(shape) + uniforms = paddle.rand(shape, dtype=self.dtype) + return paddle.divide( + paddle.add( + paddle.subtract(uniforms.log(), (-uniforms).log1p()), + paddle.subtract(probs.log(), (-probs).log1p()), + ), + temperature, + ) + + def cdf(self, value): + r"""Cumulative distribution function(CDF) evaluated at value. + + .. math:: + + { \begin{cases} + 0 & \text{if } value \lt 0 \\ + 1 - p & \text{if } 0 \leq value \lt 1 \\ + 1 & \text{if } value \geq 1 + \end{cases} + } + + Args: + value (Tensor): Value to be evaluated. + + Returns: + Tensor: CDF evaluated at value. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.cdf(paddle.to_tensor([1.0]))) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1.]) + """ + name = self.name + '_cdf' + if not _non_static_mode(): + check_type(value, 'value', tensor.Variable, name) + + value = self._check_values_dtype_in_probs(self.probs, value) + probs, value = paddle.broadcast_tensors([self.probs, value]) + + zeros = paddle.zeros_like(probs) + ones = paddle.ones_like(probs) + + return paddle.where( + value < 0, + zeros, + paddle.where(value < 1, paddle.subtract(ones, probs), ones), + name=name, + ) + + def log_prob(self, value): + """Log of probability densitiy function. + + Args: + value (Tensor): Value to be evaluated. + + Returns: + Tensor: Log of probability densitiy evaluated at value. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.log_prob(paddle.to_tensor([1.0]))) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [-1.20397282]) + """ + name = self.name + '_log_prob' + if not _non_static_mode(): + check_type(value, 'value', tensor.Variable, name) + + value = self._check_values_dtype_in_probs(self.probs, value) + logits, value = paddle.broadcast_tensors([self.logits, value]) + return -binary_cross_entropy_with_logits( + logits, value, reduction='none', name=name + ) + + def prob(self, value): + r"""Probability density function(PDF) evaluated at value. + + .. math:: + + { \begin{cases} + q=1-p & \text{if }value=0 \\ + p & \text{if }value=1 + \end{cases} + } + + Args: + value (Tensor): Value to be evaluated. + + Returns: + Tensor: PDF evaluated at value. + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.prob(paddle.to_tensor([1.0]))) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.29999998]) + """ + name = self.name + '_prob' + if not _non_static_mode(): + check_type(value, 'value', tensor.Variable, name) + + return self.log_prob(value).exp(name=name) + + def entropy(self): + r"""Entropy of Bernoulli distribution. + + .. math:: + + { + entropy = -(q \log q + p \log p) + } + + Returns: + Tensor: Entropy of distribution. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.entropy()) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.61086434]) + """ + name = self.name + '_entropy' + + return binary_cross_entropy_with_logits( + self.logits, self.probs, reduction='none', name=name + ) + + def kl_divergence(self, other): + r"""The KL-divergence between two Bernoulli distributions. + + .. math:: + + { + KL(a || b) = p_a \log(p_a / p_b) + (1 - p_a) \log((1 - p_a) / (1 - p_b)) + } + + Args: + other (Bernoulli): instance of Bernoulli. + + Returns: + Tensor: kl-divergence between two Bernoulli distributions. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + rv_other = Bernoulli(0.7) + + print(rv.kl_divergence(rv_other)) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.33891910]) + """ + name = self.name + '_kl_divergence' + if not _non_static_mode(): + check_type(other, 'other', Bernoulli, name) + + a_logits = self.logits + b_logits = other.logits + + log_pa = -softplus(-a_logits) + log_pb = -softplus(-b_logits) + + pa = sigmoid(a_logits) + one_minus_pa = sigmoid(-a_logits) + + log_one_minus_pa = -softplus(a_logits) + log_one_minus_pb = -softplus(b_logits) + + return paddle.add( + paddle.subtract( + paddle.multiply(log_pa, pa), paddle.multiply(log_pb, pa) + ), + paddle.subtract( + paddle.multiply(log_one_minus_pa, one_minus_pa), + paddle.multiply(log_one_minus_pb, one_minus_pa), + ), + ) diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index ac3b94d4ebd66..3d630b5802b72 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -15,6 +15,7 @@ import warnings import paddle +from paddle.distribution.bernoulli import Bernoulli from paddle.distribution.beta import Beta from paddle.distribution.categorical import Categorical from paddle.distribution.dirichlet import Dirichlet @@ -143,6 +144,11 @@ def __le__(self, other): return True +@register_kl(Bernoulli, Bernoulli) +def _kl_bernoulli_bernoulli(p, q): + return p.kl_divergence(q) + + @register_kl(Beta, Beta) def _kl_beta_beta(p, q): return ( diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 9a6572db72778..46f225e0d0910 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1715,35 +1715,68 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): def infershape_for_composite(block, grad_op_desc): - # pruning empty output + # NOTE: why pruning the operator with empty output here ? 
+ # Some backward operators output an empty var, which causes an infer + # shape error, e.g. assign with input's stop_gradient=True if len(grad_op_desc.output_arg_names()) == 0: return - # append op to block - op_desc = block.desc.append_op() - op_desc.copy_from(grad_op_desc) - op_desc._set_attr( - core.op_proto_and_checker_maker.kOpRoleAttrName(), - core.op_proto_and_checker_maker.OpRole.Backward, - ) - - # create output var + # create output variable new_vars = set() - # create new gradient variables - for grad_var_name in op_desc.output_arg_names(): + for grad_var_name in grad_op_desc.output_arg_names(): if not ( block.desc.has_var_recursive(grad_var_name.encode()) or grad_var_name == core.empty_var_name() ): - block.desc.var(grad_var_name.encode()) + # NOTE: stop_gradient will be set in append_op + desc = block.desc.var(grad_var_name.encode()) + block.create_var(name=grad_var_name, desc=desc, type=desc.type()) new_vars.add(grad_var_name) - # infer shape and infer dthype - op_desc.check_attrs() - op_desc.infer_var_type(block.desc) - op_desc.infer_shape(block.desc) + # NOTE For the primitive operators generated by decomposing the phi grad kernel, + # we use Operator to reconstruct the op_desc so as to reuse some complex logic, such + # as processing dispensable inputs, intermediate outputs, extra attrs, etc... + if framework.OpProtoHolder.instance().has_op_proto(grad_op_desc.type()): + op = block.append_op( + type=grad_op_desc.type(), + inputs={ + name: [block._find_var_recursive(arg) for arg in args] + for name, args in grad_op_desc.inputs().items() + }, + outputs={ + name: [block._find_var_recursive(arg) for arg in args] + for name, args in grad_op_desc.outputs().items() + }, + # NOTE Runtime attrs will be ignored as the c++ GetRuntimeAttr + # interface can't be exported to python. Please note the WARNING + # message logged in RuntimeAttrs of composite_grad_desc_maker.h + attrs=grad_op_desc.get_attr_map(), + ) + op.desc._set_attr( + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.OpRole.Backward, + ) + grad_op_desc.copy_from(op.desc) + # For other backward operators, we reuse the logic of _append_backward_var + else: + op_desc = block.desc.append_op() + op_desc.copy_from(grad_op_desc) + op_desc._set_attr( + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.OpRole.Backward, + ) + op_desc.check_attrs() + op_desc.infer_var_type(block.desc) + op_desc.infer_shape(block.desc) + for arg in op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_shape_(arg, block) + + grad_op_desc.copy_from(op_desc) - for arg in op_desc.output_arg_names(): + # NOTE: Some operators don't infer dtype correctly; this patch sets the + # grad_var dtype to match the corresponding forward variable.
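Editor's illustration of the branch above: whether a grad op goes through `block.append_op` or the raw `op_desc` path depends only on whether its type has a registered OpProto. The snippet below just probes that predicate; the op type names are arbitrary examples and it assumes a Paddle build where `paddle.fluid.framework` is importable:

```python
# Probe which reconstruction path infershape_for_composite would take for a
# given grad op type, based solely on OpProto registration.
from paddle.fluid import framework

for op_type in ("reshape2_grad", "tanh_grad"):
    has_proto = framework.OpProtoHolder.instance().has_op_proto(op_type)
    path = "block.append_op (Operator) path" if has_proto else "raw op_desc path"
    print(f"{op_type}: {path}")
```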
+ for arg in grad_op_desc.output_arg_names(): if arg in new_vars: _infer_var_data_type_shape_(arg, block) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 708cc462e78ea..537abbc50a8a2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2758,8 +2758,6 @@ class Operator: 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', - 'c_gen_hccl_id', - 'c_comm_init_hccl', 'copy_cross_scope', 'c_gen_cncl_id', } @@ -2916,14 +2914,35 @@ def find_name(var_list, name): for m in proto.outputs: if (m.name not in outputs) and m.dispensable: continue - if not ((m.name in outputs) or m.dispensable): - raise ValueError( - ( - "Incorrect setting for output(s) of " - "operator \"%s\", should set: [%s]." + + # FIXME: The outputs of primitive operator currently + # doesn't include intermediate output as it will be dropped + # in operator codegen, such as xshape output of reshape2. + # It will fixed when the operator codegen support + # intermediate output. + if core._is_bwd_prim_enabled(): + if not ( + (m.name in outputs) + or m.dispensable + or m.intermediate + ): + raise ValueError( + ( + "Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s]." + ) + % (type, m.name) ) - % (type, m.name) - ) + else: + if not ((m.name in outputs) or m.dispensable): + raise ValueError( + ( + "Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s]." + ) + % (type, m.name) + ) + for out_proto in proto.outputs: if out_proto.name not in outputs: continue diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 6ed9e674689ee..db483b151e4eb 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3215,6 +3215,7 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + self._create_param_lr(param_and_grad), master_weight, self._rho, self._epsilon, @@ -3227,6 +3228,7 @@ def _append_optimize_op(self, block, param_and_grad): "Grad": param_and_grad[1], "AvgSquaredGrad": avg_squared_grad_acc, "AvgSquaredUpdate": avg_squared_update_acc, + "LearningRate": self._create_param_lr(param_and_grad), } outputs = { "ParamOut": param_and_grad[0], diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6f461538a7c8d..909b658c0983c 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -552,8 +552,6 @@ if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)) endif() set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") -set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -774,14 +772,6 @@ add_subdirectory(sequence) add_subdirectory(rnn) add_subdirectory(distribution) -if(NOT WIN32 OR NOT WITH_GPU) - add_subdirectory(fft) -endif() - -if(WITH_XPU) - add_subdirectory(xpu) -endif() - # dist xpu tests: if(WITH_XPU_BKCL) py_test(test_collective_allreduce_api_xpu diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py new file mode 100644 index 0000000000000..4a54b99df0dba --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py @@ -0,0 +1,177 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet import auto + +_g_mesh = auto.ProcessMesh([0, 1]) +PP_MESH_0 = auto.ProcessMesh([0]) +PP_MESH_1 = auto.ProcessMesh([1]) + +image_size = 1024 +class_num = 10 + + +class MyDataset(paddle.io.Dataset): + def __init__(self, num_samples): + super().__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + input = np.random.uniform(size=image_size).astype("float32") + return input, input + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + def __init__( + self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02, + ): + super().__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range) + ) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr + ) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr + ) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = auto.shard_op(self.norm, PP_MESH_0)(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = auto.shard_op(self.linear1, PP_MESH_1)(out) + out = self.dropout(out) + out = self.linear2(out) + return out + + +class GEN(nn.Layer): + def __init__(self, mlp): + super().__init__() + self.mlp = mlp + + def forward(self, input): + model_kwargs = {} + + output = self.mlp(input) + + cur_step = paddle.full([1], 0, dtype='int64') + total_step = paddle.full([1], 10, dtype='int64') + + model_kwargs['input'] = input + model_kwargs['output'] = output + + while cur_step < total_step: + + out = self.mlp(model_kwargs['input']) + model_kwargs['res'] = out + paddle.increment(cur_step) + + auto.shard_op(paddle.assign, _g_mesh)(model_kwargs['input'], out) + + output = F.gelu(model_kwargs['input'], approximate=True) + + return output, cur_step + + +def get_model(): + + with paddle.LazyGuard(): + mlp = MLPLayer() + gen = GEN(mlp) + return gen + + +class TestGenerationPipeline(unittest.TestCase): + def test_pp2(self): + + model = get_model() + + strategy = auto.Strategy() + pipeline = strategy.pipeline + pipeline.enable = True + pipeline.schedule_mode = "stream" + pipeline.generation_batch_size = 4 + pipeline.accumulate_steps = 4 + engine = auto.Engine(model, strategy=strategy) + + engine.prepare( + inputs_spec=paddle.static.InputSpec( + shape=[2, 1024], 
name='input', dtype='float32' + ), + labels_spec=paddle.static.InputSpec( + shape=[2, 1024], name='label', dtype='float32' + ), + mode="eval", + ) + + train_data = MyDataset(50 * 2) + train_dataloader = engine._prepare_dataloader_from_generator( + dataset=train_data, + capacity=70, + iterable=False, + batch_size=2, + epochs=1, + steps_per_epoch=100, + ) + engine._prepare_reader() + + fleet_opt = engine.main_program._pipeline_opt['fleet_opt'] + assert len(fleet_opt['tasks']) == 5 + assert fleet_opt['inference_generation'] + assert fleet_opt['num_micro_batches'] == 4 + num_task_in_rank = 5 + for idx, (task_id, rank_id) in enumerate( + fleet_opt['task_id_to_rank'].items() + ): + assert ( + task_id == rank_id * num_task_in_rank + idx % num_task_in_rank + ) + + train_dataloader._inner_dataloader.start() + try: + engine._executor.run( + engine.main_program, use_program_cache=False, return_numpy=False + ) + except paddle.fluid.core.EOFException: + print("test done") + train_dataloader._inner_dataloader.reset() + train_dataloader._inner_dataloader.start() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py index 029f33f8c647e..10f78aedd4fb9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -247,6 +247,7 @@ def test_deepcopy(self): "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", "_pass_context", + "_tensor_nodes_with_same_name", ] for i in range(len(copy_list)): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py index c83c098959c13..411cee39eca54 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py @@ -203,7 +203,7 @@ def test_bf16_pass(self): bf16_o1_engine.prepare( inputs_spec=inputs_spec, labels_spec=labels_spec, mode="train" ) - self.check_program(bf16_o1_engine._dist_main_progs["train"][0]) + self.check_program(bf16_o1_engine.main_program) print("BF16!check program successfully!") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py new file mode 100644 index 0000000000000..598359cd51685 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
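An editor's note on the run loop in the unit test above: with a non-iterable dataloader, the executor is driven until the reader raises `EOFException`, after which the reader is reset. A minimal sketch of that pattern, assuming an `executor`, a `program`, and a `loader` exposing `start()`/`reset()` as in the test:

```python
# Drain-until-EOF pattern used by the generation pipeline test above.
import paddle

def run_one_pass(executor, program, loader):
    loader.start()            # arm the non-iterable reader
    try:
        while True:           # keep running until the reader is exhausted
            executor.run(program)
    except paddle.fluid.core.EOFException:
        loader.reset()        # ready for the next pass
```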
+ +import os +import subprocess +import sys +import tempfile +import unittest + + +class TestGenerationPipeline(unittest.TestCase): + def test_pp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join( + file_dir, "generation_pipeline_pass_unittest.py" + ) + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = ( + [sys.executable, "-u"] + + coverage_args + + [ + "-m", + "paddle.distributed.launch", + "--devices", + "0,1", + "--log_dir", + tmp_dir.name, + launch_model_path, + ] + ) + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py new file mode 100644 index 0000000000000..6807d22ffc3f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.topo import SingleNodeTopology + + +def check_empty_json_object(json_object): + return json_object is not None + + +class TestSingleNodeTopology(unittest.TestCase): + def test_empty_topology_json_object(self): + topo = SingleNodeTopology() + topo.detect() + + self.assertTrue(check_empty_json_object(topo.json_object)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py index dec1eb949ddb8..26e740bfa6b79 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py @@ -181,6 +181,150 @@ def forward(self, x): return x +class TestDistMPSyncTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + self.data_parallel_size = 1 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + "mp_configs": { + "sync_param": False, + "sync_grad": False, + "sync_moment": False, + }, + } + fleet.init(is_collective=True, strategy=strategy) + + def build_model_optimizer_train( + self, + batchs, + fp16=False, + mp_sync_param=False, + mp_sync_grad=False, + mp_sync_moment=False, + ): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + mp_id = hcg.get_model_parallel_rank() + dp_id = hcg.get_data_parallel_rank() + rank_id = dist.get_rank() + paddle.seed(2023) + np.random.seed(2023) + random.seed(2023) + set_random_seed(1024, dp_id, rank_id) + + np_fc1 = 
np.random.random_sample((hidden_size, inner_size)) + np_fc2 = np.random.random_sample((inner_size, hidden_size)) + + model = SimpleMPNet( + vocab_size, + hidden_size, + inner_size, + output_size, + np_fc1, + np_fc2, + mp_id, + ) + optimizer = paddle.optimizer.AdamW( + learning_rate=0.1, parameters=model.parameters() + ) + + strategy = fleet.fleet._user_defined_strategy + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + "mp_configs": { + "sync_param": mp_sync_param, + "sync_grad": mp_sync_grad, + "sync_moment": mp_sync_moment, + }, + } + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + return self.train_batch(batchs, model, optimizer, fp16) + + def train_batch(self, batchs, model, optimizer, fp16=False): + losses = [] + if fp16: + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = fleet.distributed_scaler(scaler) + for batch in batchs: + with paddle.amp.auto_cast(enable=fp16, level='O1'): + output = model(batch) + loss = output.mean() + losses.append(loss.numpy()) + if fp16: + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + optimizer.clear_grad() + return losses + + def mp_sync_base( + self, mp_sync_param=False, mp_sync_grad=False, mp_sync_moment=False + ): + batchs = [] + for _ in range(5): + np_data = np.random.randint( + 0, + vocab_size, + ( + batch_size, + seq_length, + ), + ) + batchs.append(paddle.to_tensor(np_data)) + + losses = self.build_model_optimizer_train(batchs) + losses_sync = self.build_model_optimizer_train( + batchs, + mp_sync_param=mp_sync_param, + mp_sync_grad=mp_sync_grad, + mp_sync_moment=mp_sync_moment, + ) + + for i in range(len(losses)): + np.testing.assert_allclose(losses[i], losses_sync[i], rtol=1e-6) + + # test fp16 + losses_fp16 = self.build_model_optimizer_train(batchs, fp16=True) + losses_sync_fp16 = self.build_model_optimizer_train( + batchs, + fp16=True, + mp_sync_param=mp_sync_param, + mp_sync_grad=mp_sync_grad, + mp_sync_moment=mp_sync_moment, + ) + + for i in range(len(losses_fp16)): + np.testing.assert_allclose( + losses_fp16[i], losses_sync_fp16[i], rtol=1e-6 + ) + + def test_mp_sync_param(self): + self.mp_sync_base(mp_sync_param=True) + + def test_mp_sync_grad(self): + self.mp_sync_base(mp_sync_grad=True) + + def test_mp_sync_moment(self): + self.mp_sync_base(mp_sync_moment=True) + + def test_mp_sync_all(self): + self.mp_sync_base( + mp_sync_param=True, mp_sync_grad=True, mp_sync_moment=True + ) + + class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py new file mode 100644 index 0000000000000..2229880b7a6bf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py @@ -0,0 +1,596 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import scipy.special +import scipy.stats +from config import ATOL, DEVICES, RTOL +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, + place, +) +from test_distribution import DistributionNumpy + +import paddle +from paddle.distribution import Bernoulli +from paddle.distribution.kl import kl_divergence +from paddle.fluid.data_feeder import convert_dtype + +np.random.seed(2023) +paddle.seed(2023) + +# Smallest representable number. +EPS = { + 'float32': np.finfo('float32').eps, + 'float64': np.finfo('float64').eps, +} + + +def _clip_probs_ndarray(probs, dtype): + """Clip probs from [0, 1] to (0, 1) with ``eps``""" + eps = EPS.get(dtype) + return np.clip(probs, a_min=eps, a_max=1 - eps).astype(dtype) + + +def _sigmoid(z): + return scipy.special.expit(z) + + +def _kstest(samples_a, samples_b, temperature=1): + """Uses the Kolmogorov-Smirnov test for goodness of fit.""" + _, p_value = scipy.stats.ks_2samp(samples_a, samples_b) + return not (p_value < 0.02 * (min(1, temperature))) + + +class BernoulliNumpy(DistributionNumpy): + def __init__(self, probs): + probs = np.array(probs) + if str(probs.dtype) not in ['float32', 'float64']: + self.dtype = 'float32' + else: + self.dtype = probs.dtype + + self.batch_shape = np.shape(probs) + + self.probs = _clip_probs_ndarray( + np.array(probs, dtype=self.dtype), str(self.dtype) + ) + self.logits = self._probs_to_logits(self.probs, is_binary=True) + + self.rv = scipy.stats.bernoulli(self.probs.astype('float64')) + + @property + def mean(self): + return self.rv.mean().astype(self.dtype) + + @property + def variance(self): + return self.rv.var().astype(self.dtype) + + def sample(self, shape): + shape = np.array(shape, dtype='int') + if shape.ndim: + shape = shape.tolist() + else: + shape = [shape.tolist()] + return self.rv.rvs(size=shape + list(self.batch_shape)).astype( + self.dtype + ) + + def log_prob(self, value): + return self.rv.logpmf(value).astype(self.dtype) + + def prob(self, value): + return self.rv.pmf(value).astype(self.dtype) + + def cdf(self, value): + return self.rv.cdf(value).astype(self.dtype) + + def entropy(self): + return ( + np.maximum( + self.logits, + 0, + ) + - self.logits * self.probs + + np.log(1 + np.exp(-np.abs(self.logits))) + ).astype(self.dtype) + + def kl_divergence(self, other): + """ + .. math:: + + KL[a || b] = Pa * Log[Pa / Pb] + (1 - Pa) * Log[(1 - Pa) / (1 - Pb)] + """ + p_a = self.probs + p_b = other.probs + return ( + p_a * np.log(p_a / p_b) + (1 - p_a) * np.log((1 - p_a) / (1 - p_b)) + ).astype(self.dtype) + + def _probs_to_logits(self, probs, is_binary=False): + return ( + (np.log(probs) - np.log1p(-probs)) if is_binary else np.log(probs) + ).astype(self.dtype) + + +class BernoulliTest(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + with paddle.fluid.dygraph.guard(self.place): + # just for convenience + self.dtype = self.expected_dtype + + # init numpy with `dtype` + self.init_numpy_data(self.probs, self.dtype) + + # init paddle and check dtype convert. 
+ self.init_dynamic_data(self.probs, self.default_dtype, self.dtype) + + def init_numpy_data(self, probs, dtype): + probs = np.array(probs).astype(dtype) + self.rv_np = BernoulliNumpy(probs) + + def init_dynamic_data(self, probs, default_dtype, dtype): + self.rv_paddle = Bernoulli(probs) + self.assertTrue( + dtype == convert_dtype(self.rv_paddle.probs.dtype), + (dtype, self.rv_paddle.probs.dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'probs', 'default_dtype', 'expected_dtype'), + [ + # 0-D probs + ('probs_00_32', paddle.full((), 0.0), 'float32', 'float32'), + ('probs_03_32', paddle.full((), 0.3), 'float32', 'float32'), + ('probs_10_32', paddle.full((), 1.0), 'float32', 'float32'), + ( + 'probs_00_64', + paddle.full((), 0.0, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_03_64', + paddle.full((), 0.3, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_10_64', + paddle.full((), 1.0, dtype='float64'), + 'float64', + 'float64', + ), + # 1-D probs + ('probs_00', 0.0, 'float64', 'float32'), + ('probs_03', 0.3, 'float64', 'float32'), + ('probs_10', 1.0, 'float64', 'float32'), + ('probs_tensor_03_32', paddle.to_tensor(0.3), 'float32', 'float32'), + ( + 'probs_tensor_03_64', + paddle.to_tensor(0.3, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_tensor_03_list_32', + paddle.to_tensor( + [ + 0.3, + ] + ), + 'float32', + 'float32', + ), + ( + 'probs_tensor_03_list_64', + paddle.to_tensor( + [ + 0.3, + ], + dtype='float64', + ), + 'float64', + 'float64', + ), + # N-D probs + ( + 'probs_tensor_0305', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + ), + ( + 'probs_tensor_03050104', + paddle.to_tensor(((0.3, 0.5), (0.1, 0.4))), + 'float32', + 'float32', + ), + ], +) +class BernoulliTestFeature(BernoulliTest): + def test_mean(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.mean, + self.rv_np.mean, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_variance(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.variance, + self.rv_np.variance, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + ( + paddle.to_tensor( + 0.0, + ), + ), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_log_prob(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + log_prob = self.rv_paddle.log_prob(value) + np.testing.assert_allclose( + log_prob, + self.rv_np.log_prob(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(log_prob.dtype)) + + else: + with self.assertWarns(UserWarning): + self.rv_paddle.log_prob(value) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + (paddle.to_tensor(0.0),), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_prob(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + prob = self.rv_paddle.prob(value) + np.testing.assert_allclose( + prob, + self.rv_np.prob(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(prob.dtype)) + + else: + with self.assertWarns(UserWarning): + 
self.rv_paddle.prob(value) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + (paddle.to_tensor(0.0),), + (paddle.to_tensor(0.3),), + (paddle.to_tensor(0.7),), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_cdf(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + cdf = self.rv_paddle.cdf(value) + np.testing.assert_allclose( + cdf, + self.rv_np.cdf(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(cdf.dtype)) + + else: + with self.assertWarns(UserWarning): + self.rv_paddle.cdf(value) + + def test_entropy(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.entropy(), + self.rv_np.entropy(), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_kl_divergence(self): + with paddle.fluid.dygraph.guard(self.place): + other_probs = paddle.to_tensor(0.9, dtype=self.dtype) + + rv_paddle_other = Bernoulli(other_probs) + rv_np_other = BernoulliNumpy(other_probs) + + np.testing.assert_allclose( + self.rv_paddle.kl_divergence(rv_paddle_other), + self.rv_np.kl_divergence(rv_np_other), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + np.testing.assert_allclose( + kl_divergence(self.rv_paddle, rv_paddle_other), + self.rv_np.kl_divergence(rv_np_other), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + ( + TEST_CASE_NAME, + 'probs', + 'default_dtype', + 'expected_dtype', + 'shape', + 'expected_shape', + ), + [ + # 0-D probs + ( + 'probs_0d_1d', + paddle.full((), 0.3), + 'float32', + 'float32', + [ + 100, + ], + [ + 100, + ], + ), + ( + 'probs_0d_2d', + paddle.full((), 0.3), + 'float32', + 'float32', + [100, 1], + [100, 1], + ), + ( + 'probs_0d_3d', + paddle.full((), 0.3), + 'float32', + 'float32', + [100, 2, 3], + [100, 2, 3], + ), + # 1-D probs + ( + 'probs_1d_1d_32', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [ + 100, + ], + [100, 1], + ), + ( + 'probs_1d_1d_64', + paddle.to_tensor(0.3, dtype='float64'), + 'float64', + 'float64', + paddle.to_tensor( + [ + 100, + ] + ), + [100, 1], + ), + ( + 'probs_1d_2d', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [100, 2], + [100, 2, 1], + ), + ( + 'probs_1d_3d', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [100, 2, 3], + [100, 2, 3, 1], + ), + # N-D probs + ( + 'probs_2d_1d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [ + 100, + ], + [100, 2], + ), + ( + 'probs_2d_2d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [100, 3], + [100, 3, 2], + ), + ( + 'probs_2d_3d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [100, 4, 3], + [100, 4, 3, 2], + ), + ], +) +class BernoulliTestSample(BernoulliTest): + def test_sample(self): + with paddle.fluid.dygraph.guard(self.place): + sample_np = self.rv_np.sample(self.shape) + sample_paddle = self.rv_paddle.sample(self.shape) + + self.assertEqual(list(sample_paddle.shape), self.expected_shape) + self.assertEqual(sample_paddle.dtype, self.rv_paddle.probs.dtype) + + if self.probs.ndim: + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + sample_np[..., i].reshape(-1), + sample_paddle.numpy()[..., i].reshape(-1), + ) + ) + else: + self.assertTrue( + _kstest( + sample_np.reshape(-1), + sample_paddle.numpy().reshape(-1), + ) + ) + + @parameterize_func( + [ + (1.0,), + (0.1,), + ] + ) + 
def test_rsample(self, temperature): + """Compare two samples from `rsample` method, one from scipy `sample` and another from paddle `rsample`.""" + with paddle.fluid.dygraph.guard(self.place): + sample_np = self.rv_np.sample(self.shape) + rsample_paddle = self.rv_paddle.rsample(self.shape, temperature) + + self.assertEqual(list(rsample_paddle.shape), self.expected_shape) + self.assertEqual(rsample_paddle.dtype, self.rv_paddle.probs.dtype) + + if self.probs.ndim: + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + sample_np[..., i].reshape(-1), + ( + _sigmoid(rsample_paddle.numpy()[..., i]) > 0.5 + ).reshape(-1), + temperature, + ) + ) + else: + self.assertTrue( + _kstest( + sample_np.reshape(-1), + (_sigmoid(rsample_paddle.numpy()) > 0.5).reshape(-1), + temperature, + ) + ) + + def test_rsample_backpropagation(self): + with paddle.fluid.dygraph.guard(self.place): + self.rv_paddle.probs.stop_gradient = False + rsample_paddle = self.rv_paddle.rsample(self.shape) + rsample_paddle = paddle.nn.functional.sigmoid(rsample_paddle) + grads = paddle.grad([rsample_paddle], [self.rv_paddle.probs]) + self.assertEqual(len(grads), 1) + self.assertEqual(grads[0].dtype, self.rv_paddle.probs.dtype) + self.assertEqual(grads[0].shape, self.rv_paddle.probs.shape) + + +@place(DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['BernoulliTestError']) +class BernoulliTestError(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + + @parameterize_func( + [ + (-0.1, ValueError), + (1.1, ValueError), + (np.nan, ValueError), + (-1j + 1, TypeError), + ] + ) + def test_bad_init(self, probs, error): + with paddle.fluid.dygraph.guard(self.place): + self.assertRaises(error, Bernoulli, probs) + + @parameterize_func( + [ + ( + [0.3, 0.5], + paddle.to_tensor([0.1, 0.2, 0.3]), + ), + ] + ) + def test_bad_broadcast(self, probs, value): + with paddle.fluid.dygraph.guard(self.place): + rv = Bernoulli(probs) + self.assertRaises(ValueError, rv.cdf, value) + self.assertRaises(ValueError, rv.log_prob, value) + self.assertRaises(ValueError, rv.prob, value) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py new file mode 100644 index 0000000000000..3390262792668 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
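For reference, the statistical check performed by the sampling tests above can be reproduced outside Paddle: threshold the sigmoid of a logistic-reparameterized draw at 0.5 and compare it to hard `scipy.stats.bernoulli` samples with a two-sample KS test. The sketch below is editorial, with illustrative values for `p`, `n`, and `temperature`:

```python
# NumPy/SciPy sketch of the rsample-vs-sample comparison in the tests above.
import numpy as np
import scipy.special
import scipy.stats

np.random.seed(2023)
p, n, temperature = 0.3, 10_000, 0.1

hard = scipy.stats.bernoulli(p).rvs(size=n)

u = np.random.uniform(size=n)
# (logit(u) + logit(p)) / T -- the relaxed draw returned by Bernoulli.rsample
relaxed = (np.log(u) - np.log1p(-u) + np.log(p) - np.log1p(-p)) / temperature
hardened = (scipy.special.expit(relaxed) > 0.5).astype(float)

_, p_value = scipy.stats.ks_2samp(hard, hardened)
print(p_value)  # a large p-value means the KS test does not reject
```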
+ +import unittest + +import numpy as np +from config import ATOL, DEVICES, RTOL +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, + place, +) +from test_distribution_bernoulli import BernoulliNumpy, _kstest, _sigmoid + +import paddle +from paddle.distribution import Bernoulli +from paddle.distribution.kl import kl_divergence + +np.random.seed(2023) +paddle.seed(2023) +paddle.enable_static() +default_dtype = paddle.get_default_dtype() + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'params'), # params: name, probs, probs_other, value + [ + ( + 'params', + ( + # 1-D probs + ( + 'probs_not_iterable', + 0.3, + 0.7, + 1.0, + ), + ( + 'probs_not_iterable_and_broadcast_for_value', + 0.3, + 0.7, + np.array([[0.0, 1.0], [1.0, 0.0]], dtype=default_dtype), + ), + # N-D probs + ( + 'probs_tuple_0305', + (0.3, 0.5), + 0.7, + 1.0, + ), + ( + 'probs_tuple_03050104', + ((0.3, 0.5), (0.1, 0.4)), + 0.7, + 1.0, + ), + ), + ) + ], +) +class BernoulliTestFeature(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + self.params_len = len(self.params) + + with paddle.static.program_guard(self.program): + self.init_numpy_data(self.params) + self.init_static_data(self.params) + + def init_numpy_data(self, params): + self.mean_np = [] + self.variance_np = [] + self.log_prob_np = [] + self.prob_np = [] + self.cdf_np = [] + self.entropy_np = [] + self.kl_np = [] + + for _, probs, probs_other, value in params: + rv_np = BernoulliNumpy(probs) + rv_np_other = BernoulliNumpy(probs_other) + + self.mean_np.append(rv_np.mean) + self.variance_np.append(rv_np.variance) + self.log_prob_np.append(rv_np.log_prob(value)) + self.prob_np.append(rv_np.prob(value)) + self.cdf_np.append(rv_np.cdf(value)) + self.entropy_np.append(rv_np.entropy()) + self.kl_np.append(rv_np.kl_divergence(rv_np_other)) + + def init_static_data(self, params): + with paddle.static.program_guard(self.program): + rv_paddles = [] + rv_paddles_other = [] + values = [] + for _, probs, probs_other, value in params: + if not isinstance(value, np.ndarray): + value = paddle.full([1], value, dtype=default_dtype) + else: + value = paddle.to_tensor(value, place=self.place) + + rv_paddles.append(Bernoulli(probs=paddle.to_tensor(probs))) + rv_paddles_other.append( + Bernoulli(probs=paddle.to_tensor(probs_other)) + ) + values.append(value) + + results = self.executor.run( + self.program, + feed={}, + fetch_list=[ + [ + rv_paddles[i].mean, + rv_paddles[i].variance, + rv_paddles[i].log_prob(values[i]), + rv_paddles[i].prob(values[i]), + rv_paddles[i].cdf(values[i]), + rv_paddles[i].entropy(), + rv_paddles[i].kl_divergence(rv_paddles_other[i]), + kl_divergence(rv_paddles[i], rv_paddles_other[i]), + ] + for i in range(self.params_len) + ], + ) + + self.mean_paddle = [] + self.variance_paddle = [] + self.log_prob_paddle = [] + self.prob_paddle = [] + self.cdf_paddle = [] + self.entropy_paddle = [] + self.kl_paddle = [] + self.kl_func_paddle = [] + for i in range(self.params_len): + ( + _mean, + _variance, + _log_prob, + _prob, + _cdf, + _entropy, + _kl, + _kl_func, + ) = results[i * 8 : (i + 1) * 8] + self.mean_paddle.append(_mean) + self.variance_paddle.append(_variance) + self.log_prob_paddle.append(_log_prob) + self.prob_paddle.append(_prob) + self.cdf_paddle.append(_cdf) + self.entropy_paddle.append(_entropy) + self.kl_paddle.append(_kl) + self.kl_func_paddle.append(_kl_func) + + def test_all(self): + for i in range(self.params_len): + 
self._test_mean(i) + self._test_variance(i) + self._test_log_prob(i) + self._test_prob(i) + self._test_cdf(i) + self._test_entropy(i) + self._test_kl_divergence(i) + + def _test_mean(self, i): + np.testing.assert_allclose( + self.mean_np[i], + self.mean_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_variance(self, i): + np.testing.assert_allclose( + self.variance_np[i], + self.variance_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_log_prob(self, i): + np.testing.assert_allclose( + self.log_prob_np[i], + self.log_prob_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_prob(self, i): + np.testing.assert_allclose( + self.prob_np[i], + self.prob_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_cdf(self, i): + np.testing.assert_allclose( + self.cdf_np[i], + self.cdf_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_entropy(self, i): + np.testing.assert_allclose( + self.entropy_np[i], + self.entropy_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_kl_divergence(self, i): + np.testing.assert_allclose( + self.kl_np[i], + self.kl_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + np.testing.assert_allclose( + self.kl_np[i], + self.kl_func_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'probs', 'shape', 'temperature', 'expected_shape'), + [ + # 1-D probs + ( + 'probs_03', + (0.3,), + [ + 100, + ], + 0.1, + [100, 1], + ), + # N-D probs + ( + 'probs_0305', + (0.3, 0.5), + [ + 100, + ], + 0.1, + [100, 2], + ), + ], +) +class BernoulliTestSample(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + with paddle.static.program_guard(self.program): + self.init_numpy_data(self.probs, self.shape) + self.init_static_data(self.probs, self.shape, self.temperature) + + def init_numpy_data(self, probs, shape): + self.rv_np = BernoulliNumpy(probs) + self.sample_np = self.rv_np.sample(shape) + + def init_static_data(self, probs, shape, temperature): + with paddle.static.program_guard(self.program): + self.rv_paddle = Bernoulli(probs=paddle.to_tensor(probs)) + + [self.sample_paddle, self.rsample_paddle] = self.executor.run( + self.program, + feed={}, + fetch_list=[ + self.rv_paddle.sample(shape), + self.rv_paddle.rsample(shape, temperature), + ], + ) + + def test_sample(self): + with paddle.static.program_guard(self.program): + self.assertEqual( + list(self.sample_paddle.shape), self.expected_shape + ) + + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + self.sample_np[..., i].reshape(-1), + self.sample_paddle[..., i].reshape(-1), + ) + ) + + def test_rsample(self): + """Compare two samples from `rsample` method, one from scipy and another from paddle.""" + with paddle.static.program_guard(self.program): + self.assertEqual( + list(self.rsample_paddle.shape), self.expected_shape + ) + + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + self.sample_np[..., i].reshape(-1), + (_sigmoid(self.rsample_paddle[..., i]) > 0.5).reshape( + -1 + ), + self.temperature, + ) + ) + + +@place(DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['BernoulliTestError']) +class BernoulliTestError(unittest.TestCase): + def setUp(self): + self.program 
= paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + @parameterize_func( + [ + (0,), # int + ((0.3,),), # tuple + ( + [ + 0.3, + ], + ), # list + ( + np.array( + [ + 0.3, + ] + ), + ), # ndarray + (-1j + 1,), # complex + ('0',), # str + ] + ) + def test_bad_init_type(self, probs): + with paddle.static.program_guard(self.program): + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[Bernoulli(probs=probs)] + ) + + @parameterize_func( + [ + (100,), # int + (100.0,), # float + ] + ) + def test_bad_sample_shape_type(self, shape): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.sample(shape)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.rsample(shape)] + ) + + @parameterize_func( + [ + (1,), # int + ] + ) + def test_bad_rsample_temperature_type(self, temperature): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, + feed={}, + fetch_list=[rv.rsample([100], temperature)], + ) + + @parameterize_func( + [ + (1,), # int + (1.0,), # float + ([1.0],), # list + ((1.0),), # tuple + (np.array(1.0),), # ndarray + ] + ) + def test_bad_value_type(self, value): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.log_prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.cdf(value)] + ) + + @parameterize_func( + [ + (np.array(1.0),), # ndarray or other distribution + ] + ) + def test_bad_kl_other_type(self, other): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.kl_divergence(other)] + ) + + @parameterize_func( + [ + (paddle.to_tensor([0.1, 0.2, 0.3]),), + ] + ) + def test_bad_broadcast(self, value): + with paddle.static.program_guard(self.program): + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + + # `logits, value = paddle.broadcast_tensors([self.logits, value])` + # raise ValueError in dygraph, raise TypeError in static. 
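As an editorial aside on the comment above, the broadcast failure being asserted can be reproduced directly on `paddle.broadcast_tensors`; per that comment, dygraph surfaces it as a `ValueError` while the static fetch path raises `TypeError`. A minimal dygraph sketch with illustrative shapes:

```python
import paddle

paddle.disable_static()
probs = paddle.to_tensor([0.3, 0.5])        # batch shape [2]
value = paddle.to_tensor([0.1, 0.2, 0.3])   # shape [3] -- not broadcastable
try:
    paddle.broadcast_tensors([probs, value])
except ValueError as err:
    print("broadcast failed:", err)
```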
+ with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.cdf(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.log_prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.prob(value)] + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index 4591d5512c092..04e804ea135f7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -131,8 +131,6 @@ def __repr__(self): 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', - 'c_gen_hccl_id', - 'c_comm_init_hccl', 'copy_cross_scope', } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py new file mode 100644 index 0000000000000..99ba29956c5da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle.inference import Config, create_predictor + + +class TestNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + + def forward(self, x1, x2): + y1 = self.fc1(x1) + y2 = self.fc2(x2) + return y1 + y2 + + +@unittest.skipIf( + not paddle.is_compiled_with_cuda(), 'should compile with cuda.' 
+) +class TestPredictorRunWithTensor(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + net = TestNet() + model = paddle.jit.to_static( + net, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input0' + ), + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input1' + ), + ], + ) + paddle.jit.save( + model, + os.path.join( + self.temp_dir.name, 'test_predictor_run_model/inference' + ), + ) + + def tearDown(self): + self.temp_dir.cleanup() + + def init_predictor(self): + config = Config( + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdmodel', + ), + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdiparams', + ), + ) + config.enable_use_gpu(256, 0) + config.enable_memory_optim() + predictor = create_predictor(config) + return predictor + + def get_inputs(self): + input0 = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]).astype(np.float32) + input1 = np.array([[0.1, 0.2, 0.3, 0.4], [1.2, 1.3, 1.4, 1.5]]).astype( + np.float32 + ) + + input0_tensor = paddle.to_tensor(input0) + input1_tensor = paddle.to_tensor(input1) + + return [input0_tensor, input1_tensor] + + def get_disorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + input_names = predictor.get_input_names() + input0_tensor.name = input_names[0] + input1_tensor.name = input_names[1] + + # disorder + inputs = [input1_tensor, input0_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def get_inorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + # inorder + inputs = [input0_tensor, input1_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def test_output(self): + inorder_output = self.get_inorder_output() + disorder_output = self.get_disorder_output() + + assert np.allclose( + inorder_output.numpy().flatten(), disorder_output.numpy().flatten() + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py new file mode 100644 index 0000000000000..60dbfa37aab22 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertCumsum(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7220: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(): + if self.dims == 2: + self.input_shape = [2, 3] + return np.random.random([2, 3]).astype(np.int32) + elif self.dims == 3: + self.input_shape = [2, 3, 4] + return np.random.random([2, 3, 4]).astype(np.int64) + elif self.dims == 4: + self.input_shape = [4, 3, 32, 32] + return np.random.random([4, 3, 32, 32]).astype(np.float32) - 0.5 + + for dims in [2, 3, 4]: + for axis in range(-1, dims): + for type in ["int32", "int64", "float32", "float64"]: + self.dims = dims + ops_config = [ + { + "op_type": "cumsum", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {"axis": axis, "dtype": type}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + # no op_attrs + for dims in [2, 3, 4]: + self.dims = dims + ops_config = [ + { + "op_type": "cumsum", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(): + + if self.dims == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [2, 3], + } + self.dynamic_shape.max_input_shape = { + "input_data": [2, 3], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3], + } + + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_data": [2, 3, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data": [2, 3, 4], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 4], + } + + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [4, 3, 32, 32], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 3, 32, 32], + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7220: + return 0, 3 + return 1, 2 + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision 
= paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-2 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py b/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py new file mode 100644 index 0000000000000..a4f4df5fdd1c5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle + + +class TestDispensable(unittest.TestCase): + def setUp(self): + paddle.fluid.core._set_prim_all_enabled(True) + + def tearDown(self): + paddle.fluid.core._set_prim_all_enabled(False) + + def test_dispensable(self): + @paddle.jit.to_static + def f(x): + return paddle.split(x, num_or_sections=2) + + f = paddle.jit.to_static(f) + x = paddle.rand((8,)) + x.stop_gradient = False + + op = f.get_concrete_program(x)[1].backward_program.block(0).ops[-1] + self.assertEqual( + op.attr('op_role'), + int(paddle.fluid.core.op_proto_and_checker_maker.OpRole.Backward), + ) + self.assertIn('AxisTensor', op.input_names) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 11db47b2475b9..f3eca8fec9cc7 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -26,6 +26,7 @@ def adadelta_wrapper( Grad, AvgSquaredGrad, AvgSquaredUpdate, + LearningRate, master_weight=None, rho=0.95, epsilon=1e-6, @@ -35,12 +36,13 @@ def adadelta_wrapper( Grad, AvgSquaredGrad, AvgSquaredUpdate, + LearningRate, None, rho, epsilon, False, ) - return Param, AvgSquaredGrad, AvgSquaredUpdate + return Param, AvgSquaredGrad, AvgSquaredUpdate, LearningRate class TestAdadeltaOp1(OpTest): @@ -58,11 +60,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } self.attrs = {'rho': rho, 'epsilon': epsilon} @@ -113,12 +117,13 @@ def setUp(self): epsilon = 1e-6 self.attrs = {'rho': rho, 'epsilon': epsilon} - + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * np.square( diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py index a0d1ddc8b9eec..b8d9866ebc531 100644 --- a/python/paddle/fluid/tests/unittests/test_arange.py +++ b/python/paddle/fluid/tests/unittests/test_arange.py @@ -151,6 +151,7 @@ def test_out(self): 
expected_data = np.arange(0, 5, 1).astype(np.float32) self.assertEqual((out == expected_data).all(), True) + self.assertListEqual(list(x1.shape), [5]) class TestArangeImperative(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index f7c4fb0e94e89..11c817b9baeea 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -180,6 +180,9 @@ def check_send_recv_result(dist_main_prog, rank_id): return send_result and recv_result +@unittest.skipIf( + not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" +) class TestMLPReshard(unittest.TestCase): def test_mlp_serial(self): global _global_parallel_strategy diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 86ffea08a2254..bbe322ae0175b 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from eager_op_test import OpTest, _set_use_system_allocator +from eager_op_test import ( + OpTest, + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +) from op import Operator import paddle @@ -239,7 +244,10 @@ def check_with_place(self, place, data_layout, dtype, shape): raise ValueError("Unknown data layout.") scale_shape = [c] - x_val = np.random.random_sample(x_shape).astype(dtype) + if dtype == np.uint16: + x_val = np.random.random_sample(x_shape).astype(np.float32) + else: + x_val = np.random.random_sample(x_shape).astype(dtype) # generate some negative values to test case with relu fused x_val = x_val - 0.5 scale_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -248,12 +256,20 @@ def check_with_place(self, place, data_layout, dtype, shape): mean = np.zeros(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32) - y_out = _reference_testing( - x_val, scale_val, bias_val, mean, variance, epsilon, data_layout - ).astype(dtype) + if dtype == np.uint16: + y_out = _reference_testing( + x_val, scale_val, bias_val, mean, variance, epsilon, data_layout + ).astype(np.float32) + y_out = convert_float_to_uint16(y_out) + else: + y_out = _reference_testing( + x_val, scale_val, bias_val, mean, variance, epsilon, data_layout + ).astype(dtype) if self.fuse_with_relu: y_out = np.maximum(y_out, 0) + if dtype == np.uint16: + x_val = convert_float_to_uint16(x_val) scope = core.Scope() # create input @@ -324,6 +340,11 @@ def check_with_place(self, place, data_layout, dtype, shape): y_tensor._set_dims(dims) # check inference result + atol = 1e-3 + if dtype == np.uint16: + y_tensor = convert_uint16_to_float(y_tensor) + y_out = convert_uint16_to_float(y_out) + atol = 1e-2 self.__assert_close( y_tensor, y_out, @@ -335,7 +356,7 @@ def check_with_place(self, place, data_layout, dtype, shape): + str(np.dtype(dtype)) + str(np.array(y_tensor)) + str(y_out), - atol=1e-3, + atol=atol, ) def test_check_output(self): @@ -376,6 +397,29 @@ def test_check_output(self): self.check_with_place(place, data_format, self.dtype, [2, 3]) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class 
TestBF16BatchNormOpInference(TestBatchNormOpInference): + def setUp(self): + self.dtype = np.uint16 + self.use_mkldnn = False + self.fuse_with_relu = False + self.init_kernel_type() + + def test_check_output(self): + places = [core.CUDAPlace(0)] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: + self.check_with_place( + place, data_format, self.dtype, [2, 3, 4, 5] + ) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.use_mkldnn = False diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py index 8ccaabd7c2cf0..164275b1a7d83 100644 --- a/python/paddle/fluid/tests/unittests/test_empty_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from eager_op_test import convert_uint16_to_float import paddle from paddle.fluid import core @@ -38,7 +39,7 @@ def __check_out__(self, out): f'shape should be {self.dst_shape}, but get {shape}', ) - if data_type in ['float32', 'float64', 'int32', 'int64']: + if data_type in ['float16', 'float32', 'float64', 'int32', 'int64']: max_value = np.nanmax(out) min_value = np.nanmin(out) always_non_full_zero = max_value >= min_value @@ -47,6 +48,16 @@ def __check_out__(self, out): always_full_zero or always_non_full_zero, 'always_full_zero or always_non_full_zero.', ) + elif data_type in ['uint16']: + uout = convert_uint16_to_float(out) + max_value = np.nanmax(uout) + min_value = np.nanmin(uout) + always_non_full_zero = max_value >= min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue( + always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.', + ) elif data_type in ['bool']: total_num = out.size true_num = np.sum(out) @@ -154,16 +165,13 @@ def setUp(self): def test_static_graph(self): paddle.enable_static() - - dtype = 'float32' - train_program = Program() startup_program = Program() with program_guard(train_program, startup_program): - x = np.random.random(self.x_shape).astype(dtype) + x = np.random.random(self.x_shape).astype(self.dtype) data_x = paddle.static.data( - 'x', shape=self.data_x_shape, dtype=dtype + 'x', shape=self.data_x_shape, dtype=self.dtype ) out = paddle.empty_like(data_x) @@ -176,7 +184,7 @@ def test_static_graph(self): exe = paddle.static.Executor(place) res = exe.run(train_program, feed={'x': x}, fetch_list=[out]) - self.dst_dtype = dtype + self.dst_dtype = self.dtype self.dst_shape = x.shape self.__check_out__(res[0]) @@ -185,12 +193,80 @@ def test_static_graph(self): def init_config(self): self.x_shape = (200, 3) self.data_x_shape = [200, 3] + self.dtype = 'float32' class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static): def init_config(self): self.x_shape = (3, 200, 3) self.data_x_shape = [-1, 200, 3] + self.dtype = 'float32' + + +class TestEmptyLikeAPI_StaticForFP16Op(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + self.dtype = 'float16' + + def test_static_graph(self): + paddle.enable_static() + if paddle.fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = np.random.random([200, 3]).astype(self.dtype) + data_x = paddle.static.data( + name="x", shape=[200, 3], dtype=self.dtype + ) 
+ out = paddle.empty_like(data_x) + exe = paddle.static.Executor(place) + res = exe.run( + paddle.static.default_main_program(), + feed={'x': x}, + fetch_list=[out], + ) + + self.dst_dtype = self.dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) + + +class TestEmptyLikeAPI_StaticForBF16Op(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + self.dtype = 'uint16' + + def test_static_graph(self): + paddle.enable_static() + if paddle.fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = np.random.random([200, 3]).astype(np.uint16) + data_x = paddle.static.data( + name="x", shape=[200, 3], dtype=np.uint16 + ) + out = paddle.empty_like(data_x) + exe = paddle.static.Executor(place) + res = exe.run( + paddle.static.default_main_program(), + feed={'x': x}, + fetch_list=[out], + ) + + self.dst_dtype = self.dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) class TestEmptyError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 9698fe9c54c05..752fbab31d57a 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -177,7 +177,6 @@ def python_api( path_code=None, num_classes=-1, is_sparse=False, - remote_prefetch=False, ): return paddle.nn.functional.hsigmoid_loss( input, diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py index dad821438afb8..2bdce8b4b58c5 100644 --- a/python/paddle/fluid/tests/unittests/test_input_spec.py +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -349,7 +349,7 @@ def test_run(self): ) x = paddle.randn([2, 10]) out = net(x) - np.testing.assert_equal(out.shape, [2, 5]) + np.testing.assert_equal(net.forward._input_spec, None) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py index 9e4445b7575cd..bab904db6eef0 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py @@ -15,9 +15,11 @@ import unittest import numpy as np +import parameterized as param +from eager_op_test import OpTest import paddle -from paddle import fluid +from paddle import fluid, nn from paddle.fluid import Program, core, program_guard from paddle.fluid.dygraph import to_variable @@ -33,7 +35,7 @@ def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var): var_tile = np.reshape(var, (n, c, 1, 1)) var_tile = np.tile(var_tile, (1, 1, h, w)) - x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon).astype('float32') + x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon) scale_tile = np.reshape(scale, (1, c, 1, 1)) scale_tile = np.tile(scale_tile, (n, 1, h, w)) bias_tile = np.reshape(bias, (1, c, 1, 1)) @@ -84,6 +86,633 @@ def _cal_mean_variance(x, epsilon, mean_shape): return mean, var +def instance_norm_wrapper(x, weight=None, bias=None, esp=1e-05): + return paddle.nn.functional.instance_norm( + x, None, None, weight, bias, True, 0.9, esp + ) + + +class TestInstanceNormOp(OpTest): + def setUp(self): + self.op_type = "instance_norm" + self.prim_op_type = "comp" + self.python_api = instance_norm_wrapper + 
self.public_python_api = instance_norm_wrapper + self.python_out_sig = ['Y'] + self.fw_comp_rtol = 1e-6 + self.fw_comp_atol = 1e-6 + self.rev_comp_rtol = 1e-4 + self.rev_comp_atol = 1e-4 + self.init_test_case() + ref_y_np, ref_mean_np, ref_var_np_tmp = _reference_instance_norm_naive( + self.x_np, + self.scale_np, + self.bias_np, + self.epsilon, + self.mean_np, + self.var_np, + ) + + ref_var_np = 1 / np.sqrt(ref_var_np_tmp + self.epsilon) + self.inputs = { + 'X': self.x_np, + 'Scale': self.scale_np, + 'Bias': self.bias_np, + } + self.attrs = {'epsilon': self.epsilon} + self.outputs = { + 'Y': ref_y_np, + 'SavedMean': ref_mean_np, + 'SavedVariance': ref_var_np, + } + self.enable_cinn = False + + def test_check_output(self): + self.check_output(check_prim=True) + + def test_check_grad(self): + self.check_grad(['X', 'Scale', 'Bias'], 'Y', check_prim=True) + + def init_test_case(self): + x_shape = [2, 100, 4, 5] + n, c, h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3] + self.epsilon = 1e-05 + dtype = np.float32 + scale_shape = [c] + mean_shape = [n * c] + np.random.seed() + self.x_np = np.random.random_sample(x_shape).astype(dtype) + self.scale_np = np.random.random_sample(scale_shape).astype(dtype) + self.bias_np = np.random.random_sample(scale_shape).astype(dtype) + self.mean_np, self.var_np = _cal_mean_variance( + self.x_np, self.epsilon, mean_shape + ) + self.dtype = dtype + + +class TestInstanceNormFP64(TestInstanceNormOp): + def init_test_case(self): + x_shape = [2, 100, 4, 5] + n, c, h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3] + self.epsilon = 1e-5 + dtype = np.float64 + scale_shape = [c] + mean_shape = [n * c] + np.random.seed() + self.x_np = np.random.random_sample(x_shape).astype(dtype) + self.scale_np = np.ones(scale_shape).astype(dtype) + self.bias_np = np.zeros(scale_shape).astype(dtype) + self.mean_np, self.var_np = _cal_mean_variance( + self.x_np, self.epsilon, mean_shape + ) + self.fw_comp_rtol = 1e-14 + self.fw_comp_atol = 1e-14 + self.rev_comp_rtol = 1e-13 + self.rev_comp_atol = 1e-13 + self.dtype = dtype + + +class PrimGroupNorm(paddle.nn.Layer): + def __init__(self, num_channels, scale, bias): + super().__init__() + self.func = nn.InstanceNorm2D(num_channels) + paddle.assign(scale, self.func.scale) + paddle.assign(bias, self.func.bias) + + def forward(self, x): + out = self.func(x) + return out + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=False) + + +places = [paddle.CPUPlace()] +if paddle.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + +@param.parameterized_class( + ( + 'name', + 'shape', + 'epsilon', + 'data_format', + 'places', + 'dtype', + 'threshold_list', + 'special_threshold', + ), + ( + ( + 'test0', + (2, 100, 3, 5), + 1e-5, + 'NCHW', + places, + 'float32', + [ + [1e-5, 1e-5, 1e-5], # cpu thresholds for static + [1e-5, 1e-5, 1e-5], # gpu thresholds for static + ], + None, + ), + ( + 'test1', + (2, 100, 3, 5), + 1e-5, + 'NCHW', + places, + 'float32', + [ + [1e-5, 1e-5, 1e-5], # cpu thresholds for static + [1e-5, 1e-5, 1e-5], # gpu thresholds for static + ], + None, + ), + ( + 'testbigdata_fp32', + (8, 32, 32, 64), + 1e-5, + 'NCHW', + places, + 'float32', + [ + [1e-5, 1e-5, 1e-5], # cpu thresholds for static + [1e-5, 1e-5, 1e-5], # gpu thresholds for static + ], # gpu thresholds + [2e-2, 2e-2, 2e-2], # special grad threshold for scale + ), + ( + 'test0_fp64', + (2, 100, 3, 5), + 1e-5, + 'NCHW', + 
places, + 'float64', + [ + [1e-14, 1e-14, 1e-14], # cpu thresholds for static + [1e-14, 1e-14, 1e-14], # gpu thresholds for static + ], + [1e-13, 1e-13, 1e-13], + ), + ( + 'test1_fp64', + (2, 100, 3, 5), + 1e-5, + 'NCHW', + places, + 'float64', + [ + [1e-14, 1e-14, 1e-14], # cpu thresholds for static + [1e-14, 1e-14, 1e-14], # gpu thresholds for static + ], + [1e-13, 1e-13, 1e-13], + ), + ( + 'testbigdata_fp64', + (8, 32, 32, 64), + 1e-5, + 'NCHW', + places, + 'float64', + [ + [1e-14, 1e-14, 1e-14], # cpu thresholds + [1e-14, 1e-14, 1e-14], + ], # gpu thresholds + [5e-11, 5e-11, 5e-11], # for X_grad + ), + ), +) +class TestCompositeInstanceNormNorm(unittest.TestCase): + @classmethod + def setUpClass(cls): + core._set_prim_all_enabled(True) + + @classmethod + def tearDownClass(cls): + core._set_prim_all_enabled(False) + + def setUp(self): + np.random.seed(1234) + self.fwd_desire = [] + self.rev_desire = [] + self.x = np.random.random(self.shape).astype(self.dtype) + self.scale = np.random.random([self.shape[1]]).astype(self.dtype) + self.bias = np.random.random([self.shape[1]]).astype(self.dtype) + self.num_channels = self.shape[1] + + self.static_fwd_desire = [] + self.static_rev_desire = [] + for place in self.places: + fwd_desire, rev_desire = self.get_eager_desire(place) + self.fwd_desire.append(fwd_desire.numpy()) + self.rev_desire.append(rev_desire.numpy()) + self.static_fwd_desire.append([]) + self.static_rev_desire.append([]) + fwd, rev = self.get_static_desire(place) + self.static_fwd_desire[-1].append(fwd[0]) + self.static_fwd_desire[-1].append(fwd[1]) + self.static_fwd_desire[-1].append(fwd[2]) + self.static_rev_desire[-1].append(rev[0]) + self.static_rev_desire[-1].append(rev[1]) + self.static_rev_desire[-1].append(rev[2]) + + def get_eager_desire(self, place): + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") + core.set_prim_eager_enabled(False) + paddle.disable_static() + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, dtype=self.dtype, place=place, stop_gradient=False + ) + bias_ = paddle.to_tensor( + data=self.bias, dtype=self.dtype, place=place, stop_gradient=False + ) + output = paddle.nn.functional.instance_norm( + input_, None, None, scale_, bias_, True, 0.9, self.epsilon + ) + grad = paddle.grad(output, input_) + + return output, grad[0] + + def get_static_desire(self, place): + core._set_prim_all_enabled(False) + paddle.enable_static() + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") + + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input_ = paddle.static.data( + 'x', shape=self.x.shape, dtype=self.x.dtype + ) + input_.stop_gradient = False + + scale_ = paddle.static.data( + 'scale_', shape=self.scale.shape, dtype=self.scale.dtype + ) + scale_.stop_gradient = False + + bias_ = paddle.static.data( + 'bias_', shape=self.bias.shape, dtype=self.bias.dtype + ) + bias_.stop_gradient = False + + output = paddle.nn.functional.instance_norm( + input_, None, None, scale_, bias_, True, 0.9, self.epsilon + ) + + blocks = mp.blocks + names = dict( + zip( + blocks[0].ops[0].output_names, + blocks[0].ops[0].output_arg_names, + ) + ) + vars_list = [ + names[key] + for key in [ + "Y", + "SavedMean", + "SavedVariance", + ] + ] + + fwd_ops = [op.type for op in 
blocks[0].ops] + # Ensure that instance_norm in original block + assert 'instance_norm' in fwd_ops + + if core._is_fwd_prim_enabled(): + paddle.incubate.autograd.primapi.to_prim(mp.blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that instance_norm is splitted into small ops + assert 'instance_norm' not in fwd_ops_new + + grads = paddle.static.gradients([output], [input_, scale_, bias_]) + + exe = paddle.static.Executor(place) + exe.run(sp) + out_list = exe.run( + mp, + feed={ + input_.name: self.x, + scale_.name: self.scale, + bias_.name: self.bias, + }, + fetch_list=vars_list + [grads], + ) + paddle.disable_static() + core._set_prim_all_enabled(True) + + return out_list[:3], out_list[3:] + + def test_static_comp(self): + paddle.enable_static() + mps = [] + fwd_actual = [] + rev_actual = [] + if len(self.places) < 1: + return + + with paddle.fluid.framework._static_guard(): + for place in self.places: + fwd_actual.append([]) + rev_actual.append([]) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input_ = paddle.static.data( + 'x', shape=self.x.shape, dtype=self.x.dtype + ) + input_.stop_gradient = False + + scale_ = paddle.static.data( + 'scale_', shape=self.scale.shape, dtype=self.scale.dtype + ) + scale_.stop_gradient = False + + bias_ = paddle.static.data( + 'bias_', shape=self.bias.shape, dtype=self.bias.dtype + ) + bias_.stop_gradient = False + + output = paddle.nn.functional.instance_norm( + input_, + None, + None, + scale_, + bias_, + True, + 0.9, + self.epsilon, + ) + + blocks = mp.blocks + names = dict( + zip( + blocks[0].ops[0].output_names, + blocks[0].ops[0].output_arg_names, + ) + ) + vars_list = [ + names[key] + for key in [ + "Y", + "SavedMean", + "SavedVariance", + ] + ] + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that instance_norm in original block + assert 'instance_norm' in fwd_ops + + if core._is_fwd_prim_enabled(): + paddle.incubate.autograd.primapi.to_prim(mp.blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that instance_norm is splitted into small ops + assert 'instance_norm' not in fwd_ops_new + + grads = paddle.static.gradients( + output, [input_, scale_, bias_] + ) + exe = paddle.static.Executor(place) + exe.run(sp) + out_list = exe.run( + mp, + feed={ + input_.name: self.x, + scale_.name: self.scale, + bias_.name: self.bias, + }, + fetch_list=vars_list + [grads], + ) + fwd_actual[-1].append(out_list[0]) + fwd_actual[-1].append(out_list[1]) + fwd_actual[-1].append(out_list[2]) + rev_actual[-1].append(out_list[3]) + rev_actual[-1].append(out_list[4]) + rev_actual[-1].append(out_list[5]) + mps.append(mp) + + vars_name = [ + "Y", + "SavedMean", + "SavedVariance", + "X_grad", + "Scale_grad", + "Bias_grad", + ] + + for i in range(len(self.places)): + self.assertTrue( + 'instance_norm' not in [op.type for op in mps[i].block(0).ops] + ) + atol = self.threshold_list[i][0] + rtol = self.threshold_list[i][0] + for j in range(len(self.static_fwd_desire[i])): + # in float16 type, Y is float16, mean and var are float16 + # so check mean and var with float32 gpu threshold + if self.dtype == 'float16' and j > 0: + atol = 1e-5 + rtol = 1e-5 + + np.testing.assert_allclose( + self.static_fwd_desire[i][j], + fwd_actual[i][j], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j]}", + ) + max_abs_diff = np.max( + np.abs(self.static_fwd_desire[i][j] - fwd_actual[i][j]) + ) + print( + self.shape, + self.dtype, + 
self.places[i], + vars_name[j], + max_abs_diff, + ) + # compare with eager_desire + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i][0], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed with fwd_eager:{self.places[i]}", + ) + + for j in range(len(self.static_rev_desire[i])): + if self.special_threshold is not None and j <= 1: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + else: + atol = self.threshold_list[i][0] + rtol = self.threshold_list[i][0] + + max_abs_diff = np.max( + np.abs(self.static_rev_desire[i][j] - rev_actual[i][j]) + ) + + print( + self.shape, + self.dtype, + self.places[i], + vars_name[j + 3], + max_abs_diff, + ) + + np.testing.assert_allclose( + self.static_rev_desire[i][j], + rev_actual[i][j], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j + 3]}", + ) + + # now use larger threshold when testing cpu grads to bypass cpu grad test + if self.special_threshold is not None and i == 0: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + # compare with eager_desire + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i][0], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed with rev_eager:{self.places[i]}", + ) + + paddle.disable_static() + + def test_jit_comp(self): + fwd_actual = [] + rev_actual = [] + for place in self.places: + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + bias_ = paddle.to_tensor( + data=self.bias, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + net = PrimGroupNorm(self.num_channels, scale_, bias_) + net = apply_to_static(net, False) + output = net(input_) + + grad = paddle.grad(output, input_) + fwd_actual.append(output.numpy()) + rev_actual.append(grad[0].numpy()) + + for i in range(len(self.places)): + atol = self.threshold_list[i][1] + rtol = self.threshold_list[i][1] + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i], + rtol=rtol, + atol=atol, + err_msg='%s jit fwd' % self.places[i], + ) + + # now use larger threshold when testing cpu grads to bypass cpu grad test + if self.special_threshold is not None: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i], + rtol=rtol, + atol=atol, + err_msg='%s jit rev' % self.places[i], + ) + + def test_jit_comp_with_cinn(self): + fwd_actual = [] + rev_actual = [] + for place in self.places: + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + bias_ = paddle.to_tensor( + data=self.bias, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + net = PrimGroupNorm(self.num_channels, scale_, bias_) + net = apply_to_static(net, False) + output = net(input_) + grad = paddle.grad(output, input_) + fwd_actual.append(output.numpy()) + rev_actual.append(grad[0].numpy()) + + for i in range(len(self.places)): + atol = self.threshold_list[i][2] + rtol = self.threshold_list[i][2] + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i], + rtol=rtol, # mean of uniform distribution, scale for avoid random failed + atol=atol, + err_msg='%s jit_cinn fwd' % self.places[i], + ) + # now use larger threshold when testing cpu grads to bypass cpu 
grad test + if self.special_threshold is not None: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i], + rtol=rtol, # mean of uniform distribution, scale for avoid random failed + atol=atol, + err_msg='%s jit_cinn rev' % self.places[i], + ) + + +class TestInstanceNormCase1(TestInstanceNormOp): + def init_test_case(self): + x_shape = [2, 100, 4, 5] + n, c, h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3] + self.epsilon = 1e-05 + dtype = np.float32 + scale_shape = [c] + mean_shape = [n * c] + np.random.seed() + self.x_np = np.random.random_sample(x_shape).astype(dtype) + self.scale_np = np.ones(scale_shape).astype(dtype) + self.bias_np = np.zeros(scale_shape).astype(dtype) + self.mean_np, self.var_np = _cal_mean_variance( + self.x_np, self.epsilon, mean_shape + ) + + class TestInstanceNormOpTraining(unittest.TestCase): def setUp(self): self.epsilon = 1e-5 @@ -112,6 +741,7 @@ def set_global_mean_var(self, mean_shape, x): def test_forward_backward(self): def test_with_place(place, shape): + paddle.enable_static() epsilon = self.epsilon n, c, h, w = shape[0], shape[1], shape[2], shape[3] scale_shape = [c] @@ -207,6 +837,7 @@ def test_with_place(place, shape): for id, name in enumerate(self.fetch_list): self.__assert_close(var_dict[name], out[id], name) print("op test forward passes: ", str(place)) + paddle.disable_static() places = [core.CPUPlace()] @@ -234,6 +865,7 @@ def init_test_case(self): class TestInstanceNormOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # the input of instance_norm must be Variable. x1 = fluid.create_lod_tensor( @@ -246,14 +878,17 @@ def test_errors(self): name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32" ) self.assertRaises(TypeError, paddle.static.nn.instance_norm, x2) + paddle.disable_static() class TestInstanceNormOpErrorCase1(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # the first dimension of input for instance_norm must between [2d, 5d] x = paddle.static.data(name='x', shape=[3], dtype="float32") self.assertRaises(ValueError, paddle.static.nn.instance_norm, x) + paddle.disable_static() class TestElasticNormOp(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index d214965b2dd6e..ab687aeb034f5 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -18,8 +18,9 @@ from eager_op_test import OpTest, convert_float_to_uint16 import paddle -from paddle import fluid -from paddle.fluid import Program, core, program_guard +import paddle.nn.functional as F +from paddle import fluid, nn +from paddle.fluid import Program, core, framework, program_guard class TestInstanceNorm(unittest.TestCase): @@ -319,5 +320,64 @@ def test_check_grad(self): ) +class PrimNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv = nn.Conv2D(2, 4, (3, 3), bias_attr=False) + self.instance_norm = nn.InstanceNorm2D(4) + + def forward(self, x): + y = self.conv(x) + out = self.instance_norm(y) + res = F.max_pool2d(out, kernel_size=2, stride=2, padding=0) + return res + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, 
build_strategy=False) + + +class TestPrimForwardAndBackward(unittest.TestCase): + """ + Test PrimNet with @to_static + amp O2(with fp32) + """ + + def setUp(self): + paddle.seed(2022) + paddle.disable_static() + self.x = paddle.randn([4, 2, 6, 6], dtype="float32") + self.x.stop_gradient = False + + def train(self, use_amp, data_layout="NCHW"): + paddle.seed(2022) + net = PrimNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + net = apply_to_static(net, False) + if use_amp: + net = paddle.amp.decorate(models=net, level='O2') + with paddle.amp.auto_cast(enable=use_amp, level='O2'): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + return loss + + def test_amp_nchw(self): + if not isinstance(framework._current_expected_place(), core.CPUPlace): + expected = self.train(False) + actual = self.train(True) + np.testing.assert_allclose( + expected, + actual, + rtol=1e-3, + atol=1e-3, + ) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py old mode 100644 new mode 100755 index 6599f66140c22..efda5d502c6a6 --- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from paddle.fluid import core @@ -48,6 +48,28 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isinf Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestInfBF16(OpTest): + def setUp(self): + self.op_type = "isinf" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.inf + x[-1] = np.inf + + out = np.array(True) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + class TestNAN(OpTest): def setUp(self): self.op_type = "isnan" @@ -76,6 +98,28 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isnan Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestNANBF16(OpTest): + def setUp(self): + self.op_type = "isnan" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.nan + x[-1] = np.nan + + out = np.array(True) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + class TestIsfinite(OpTest): def setUp(self): self.op_type = "isfinite" @@ -105,5 +149,27 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isfinite Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestIsfiniteBF16(OpTest): + def setUp(self): + self.op_type = "isfinite" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.inf + x[-1] = np.nan + + out = np.array(False) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + 
self.check_output_with_place(core.CUDAPlace(0)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5ab2bad28e3c3..aae7ba87697ce 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from eager_op_test import convert_float_to_uint16 import paddle from paddle.fluid import core @@ -366,7 +367,11 @@ def setUp(self): self.init_data_format() self.init_shape() - input = np.random.random(self.shape).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.shape).astype(np.float32) + else: + input = np.random.random(self.shape).astype(self.dtype) + output = pool2D_forward_naive( input, self.ksize, @@ -379,8 +384,14 @@ def setUp(self): self.data_format, self.pool_type, self.padding_algorithm, - ).astype(self.dtype) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} + ) + + if self.is_bfloat16_op(): + output = convert_float_to_uint16(output) + self.inputs = {'X': convert_float_to_uint16(input)} + else: + output = output.astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { 'strides': self.strides, @@ -427,7 +438,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) elif self.pool_type != "max": @@ -577,7 +587,6 @@ def test_check_output(self): if core.is_float16_supported(place): self.check_output_with_place( place, - atol=1e-3, check_dygraph=(not self.use_mkldnn), ) @@ -593,7 +602,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) @@ -618,7 +626,6 @@ def test_check_output(self): if core.is_float16_supported(place): self.check_output_with_place( place, - atol=1e-3, check_dygraph=(not self.use_mkldnn), ) @@ -634,7 +641,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) @@ -643,20 +649,58 @@ def test_check_grad(self): globals()[cls_name] = TestFp16Case +def create_test_bf16_class(parent, check_grad=True): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestBf16Case(parent): + def init_kernel_type(self): + self.use_cuda = True + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, + check_dygraph=(not self.use_mkldnn), + ) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if self.pool_type != "max" and check_grad: + self.check_grad_with_place( + place, + {'X'}, + 'Out', + check_dygraph=(not self.use_mkldnn), + ) + + cls_name = "{}_{}".format(parent.__name__, "Bf16Op") + TestBf16Case.__name__ = cls_name + globals()[cls_name] = TestBf16Case + + create_test_cudnn_fp16_class(TestPool2D_Op) -create_test_cudnn_fp16_class(TestCase1, check_grad=False) +create_test_cudnn_fp16_class(TestCase1) create_test_cudnn_fp16_class(TestCase2) create_test_cudnn_fp16_class(TestCase3) create_test_cudnn_fp16_class(TestCase4) create_test_cudnn_fp16_class(TestCase5) create_test_fp16_class(TestPool2D_Op) -create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase1) create_test_fp16_class(TestCase2) create_test_fp16_class(TestCase3) create_test_fp16_class(TestCase4) create_test_fp16_class(TestCase5) 
+create_test_bf16_class(TestPool2D_Op) +create_test_bf16_class(TestCase1) +create_test_bf16_class(TestCase2) +create_test_bf16_class(TestCase3) +create_test_bf16_class(TestCase4) +create_test_bf16_class(TestCase5) # --------------------test pool2d use ceil mode-------------------- @@ -796,12 +840,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_AsyPadding) create_test_cudnn_fp16_class(TestPool2D_AsyPadding) -create_test_cudnn_fp16_class(TestCase1_AsyPadding, check_grad=False) +create_test_cudnn_fp16_class(TestCase1_AsyPadding) create_test_cudnn_fp16_class(TestCase2_AsyPadding) create_test_cudnn_fp16_class(TestCase3_AsyPadding) create_test_cudnn_fp16_class(TestCase4_AsyPadding) create_test_cudnn_fp16_class(TestCase5_AsyPadding) +create_test_fp16_class(TestPool2D_AsyPadding) +create_test_fp16_class(TestCase1_AsyPadding) +create_test_fp16_class(TestCase2_AsyPadding) +create_test_fp16_class(TestCase3_AsyPadding) +create_test_fp16_class(TestCase4_AsyPadding) +create_test_fp16_class(TestCase5_AsyPadding) + +create_test_bf16_class(TestPool2D_AsyPadding) +create_test_bf16_class(TestCase1_AsyPadding) +create_test_bf16_class(TestCase2_AsyPadding) +create_test_bf16_class(TestCase3_AsyPadding) +create_test_bf16_class(TestCase4_AsyPadding) +create_test_bf16_class(TestCase5_AsyPadding) + create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding) @@ -908,12 +966,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_channel_last) create_test_cudnn_fp16_class(TestPool2D_channel_last) -create_test_cudnn_fp16_class(TestCase1_channel_last, check_grad=False) +create_test_cudnn_fp16_class(TestCase1_channel_last) create_test_cudnn_fp16_class(TestCase2_channel_last) create_test_cudnn_fp16_class(TestCase3_channel_last) create_test_cudnn_fp16_class(TestCase4_channel_last) create_test_cudnn_fp16_class(TestCase5_channel_last) +create_test_fp16_class(TestPool2D_channel_last) +create_test_fp16_class(TestCase1_channel_last) +create_test_fp16_class(TestCase2_channel_last) +create_test_fp16_class(TestCase3_channel_last) +create_test_fp16_class(TestCase4_channel_last) +create_test_fp16_class(TestCase5_channel_last) + +create_test_bf16_class(TestPool2D_channel_last) +create_test_bf16_class(TestCase1_channel_last) +create_test_bf16_class(TestCase2_channel_last) +create_test_bf16_class(TestCase3_channel_last) +create_test_bf16_class(TestCase4_channel_last) +create_test_bf16_class(TestCase5_channel_last) + create_test_cudnn_use_ceil_class(TestPool2D_channel_last) create_test_cudnn_use_ceil_class(TestCase1_channel_last) @@ -1023,14 +1095,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestPool2D_AsyPadding_channel_last) -create_test_cudnn_fp16_class( - TestCase1_AsyPadding_channel_last, check_grad=False -) +create_test_cudnn_fp16_class(TestCase1_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase2_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase3_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase4_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase5_AsyPadding_channel_last) +create_test_fp16_class(TestPool2D_AsyPadding_channel_last) +create_test_fp16_class(TestCase1_AsyPadding_channel_last) +create_test_fp16_class(TestCase2_AsyPadding_channel_last) +create_test_fp16_class(TestCase3_AsyPadding_channel_last) +create_test_fp16_class(TestCase4_AsyPadding_channel_last) +create_test_fp16_class(TestCase5_AsyPadding_channel_last) + 
+create_test_bf16_class(TestPool2D_AsyPadding_channel_last) +create_test_bf16_class(TestCase1_AsyPadding_channel_last) +create_test_bf16_class(TestCase2_AsyPadding_channel_last) +create_test_bf16_class(TestCase3_AsyPadding_channel_last) +create_test_bf16_class(TestCase4_AsyPadding_channel_last) +create_test_bf16_class(TestCase5_AsyPadding_channel_last) + create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding_channel_last) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last) diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index d2411dda4b95a..b6a19615a6eda 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -105,6 +105,47 @@ def initParameters(self): self.enable_cinn = False +class TestStackFP16Op(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + + +class TestStackFP16Op1(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.num_inputs = 8 + + +class TestStackFP16Op2(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.num_inputs = 10 + + +class TestStackFP16Op3(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -1 + + +class TestStackFP16Op4(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -4 + + +class TestStackFP16Op5(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 1 + + +class TestStackFP16Op6(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 3 + + class TestStackBF16Op(OpTest): def initDefaultParameters(self): self.num_inputs = 4 diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py index 34c6950d7f1d8..9e20a78011c9d 100755 --- a/python/paddle/fluid/tests/unittests/test_unstack_op.py +++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py @@ -15,9 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle +from paddle import fluid +from paddle.fluid import core class TestUnStackOpBase(OpTest): @@ -64,6 +66,35 @@ def test_check_grad(self): self.check_grad(['X'], self.get_y_names()) +class TestUnStackFP16Op(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + + +class TestStackFP16Op3(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -1 + + +class TestStackFP16Op4(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -3 + + +class TestStackFP16Op5(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 1 + + +class TestStackFP16Op6(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 2 + + class TestStackOp3(TestUnStackOpBase): def initParameters(self): self.axis = -1 @@ -84,6 +115,71 @@ def initParameters(self): self.axis = 2 +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestUnStackBF16Op(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = np.uint16 + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append(f'y{i}') + 
return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.python_api = paddle.unstack + self.x = np.random.random(size=self.input_dim).astype(np.float32) + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + tmp_names = [] + for i in range(self.input_dim[self.axis]): + tmp.append( + ( + y_names[i], + np.reshape(convert_float_to_uint16(outs[i]), new_shape), + ) + ) + tmp_names.append(y_names[i]) + + self.x = convert_float_to_uint16(self.x) + self.python_out_sig = tmp_names + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor(self.inputs['X']) + x.stop_gradient = False + y = paddle.unstack( + x, axis=self.attrs['axis'], num=self.attrs['num'] + ) + dx = paddle.grad(y, x)[0].numpy() + dx_expected = convert_float_to_uint16( + np.ones(self.input_dim, np.float32) + ) + np.testing.assert_array_equal(dx, dx_expected) + + class TestUnstackZeroInputOp(unittest.TestCase): def unstack_zero_input_static(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt deleted file mode 100644 index cc46e42f8ca64..0000000000000 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -file( - GLOB TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -if(WITH_XPU_BKCL) - list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op") -endif() - -file( - GLOB DIST_TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_dist_*.py") -if(WITH_XPU_BKCL) - list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op) -endif() - -foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach() - -foreach(TEST_OP ${DIST_TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach() - -set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) -set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900) -set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300) -set_tests_properties(test_collective_identity_xpu - PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") -set_tests_properties(test_collective_allgather_xpu - PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") -set_tests_properties(test_collective_allreduce_xpu - PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index 84b7d415638b2..ba92c5dba718d 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -160,8 +160,8 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): var_tmp1 = difference * difference variance = mean(var_tmp1, axis=axis, keepdim=True) var_tmp3 = variance + epsilon - sqrt_var = sqrt(var_tmp3) - out = difference / sqrt_var + rsqrt_var = rsqrt(var_tmp3) + out = difference * rsqrt_var if scale is not None: scale = reshape(scale, x.shape[begin_norm_axis:]) @@ -178,6 +178,36 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): return out, mean_, variance 
+@REGISTER_COMPOSITE('instance_norm') +def instancenorm_composite(x, scale, bias, epsilon): + """ + define composite rule of op instance_norm + out = (x - mean(x)) / sqrt(var + epsilon) + var = mean((x-mean(x))^2) + """ + n, c, h, w = x.shape + axis = tuple(range(2, len(x.shape))) + mean_ = mean(x, axis=axis, keepdim=True) + difference = x - mean_ + var_tmp1 = difference * difference + variance = mean(var_tmp1, axis=axis, keepdim=True) + var_tmp3 = variance + epsilon + sqrt_var = pow(var_tmp3, full([], 0.5, dtype=var_tmp3.dtype)) + out = difference / sqrt_var + + if scale is not None: + scale_tile = reshape(scale, [1, c, 1, 1]) + out = out * scale_tile + if bias is not None: + bias_tile = reshape(bias, [1, c, 1, 1]) + out = out + bias_tile + + mean_ = reshape(mean_, [-1]) + saved_variance = 1 / sqrt_var + saved_variance = reshape(saved_variance, [-1]) + return out, mean_, saved_variance + + + @REGISTER_COMPOSITE('gelu') def gelu_composite(x, approximate): """define composite rule of op gelu""" diff --git a/python/paddle/incubate/autograd/primitives.py b/python/paddle/incubate/autograd/primitives.py index cc8ba89423d7c..9f52d9d69ac23 100644 --- a/python/paddle/incubate/autograd/primitives.py +++ b/python/paddle/incubate/autograd/primitives.py @@ -50,6 +50,7 @@ from paddle.tensor import pow # noqa: F401 from paddle.tensor import prod # noqa: F401 from paddle.tensor import reshape # noqa: F401 +from paddle.tensor import rsqrt # noqa: F401 from paddle.tensor import sign # noqa: F401 from paddle.tensor import sin # noqa: F401 from paddle.tensor import sinh # noqa: F401 @@ -117,6 +118,7 @@ 'ones', 'zeros', 'sqrt', + 'rsqrt', ] others = [ diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index bc07609a111ee..bde75f6ad73a0 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -218,8 +218,23 @@ def ignore_module(modules: list[Any]): add_ignore_module(modules) + +def _check_and_set_backend(backend, build_strategy): + if backend not in ['CINN', None]: + raise ValueError( + "The backend of to_static should be 'CINN' or None, but received {}.".format( + backend + ) + ) + if backend == 'CINN': + build_strategy.build_cinn_pass = True + + def to_static( - function=None, input_spec=None, build_strategy=None, property=False + function=None, + input_spec=None, + build_strategy=None, + backend=None, + **kwargs, ): """ Converts imperative dygraph APIs into declarative function APIs. Decorator @@ -228,7 +243,6 @@ def to_static( Tensor(s) to do imperative training, inference, or other operations. If the decorated function calls other imperative function, the called one will be converted into declarative function as well. - Args: function (callable): callable imperative function. input_spec(list[InputSpec]|tuple[InputSpec]): list/tuple of InputSpec to specific the shape/dtype/name @@ -238,7 +252,8 @@ def to_static( in the computational graph and memory optimization during the execution of the computational graph. For more information about build_strategy, please refer to :code:`paddle.static.BuildStrategy`. The default is None. - property(bool, Optional): whether the fucntion is python property. The default is False. + backend(str, Optional): Specifies the compilation backend, which can be `CINN` or None. When backend is `CINN`, the CINN compiler will be used to speed up training and inference. + kwargs: Supported keys include `property`; set `property` to True if the function is a Python property. Returns: @@ -263,6 +278,7 @@ def func(x): print(x_v) # [[2.
2.]] """ + property = kwargs.get("property", False) def decorated(python_func): """ @@ -279,6 +295,7 @@ def decorated(python_func): input_spec=input_spec, build_strategy=build_strategy, property=property, + backend=backend, ), ) @@ -291,6 +308,7 @@ def decorated(python_func): type(build_strategy).__name__ ) ) + _check_and_set_backend(backend, build_strategy) # for usage: `to_static(foo, ...)` if function is not None: diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 9538bb9300742..7a6afc82b1bf0 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -27,7 +27,12 @@ from paddle.optimizer.lr import LRScheduler from . import logging_utils -from .utils import RETURN_NO_VALUE_MAGIC_NUM, _out_grad_names, _param_grad_names +from .utils import ( + RETURN_NO_VALUE_MAGIC_NUM, + _out_grad_names, + _param_grad_names, + backend_guard, +) __all__ = [] @@ -197,6 +202,7 @@ def __init__( # program_id -> list(scope) self._scope_cache = {} self._hooker = None + self._backend = kwargs.get('backend', None) def __call__(self, inputs): """ @@ -636,10 +642,9 @@ def _append_backward_desc(self, main_program): start_idx = len(program.block(0).ops) + len(self._outputs.tolist()) if targets: - # TODO(CZ): later when use cinn, set_prim_all_enabled and check_and_set_prim_all_enabled will be set at else branch. - core.check_and_set_prim_all_enabled() start_idx = len(program.block(0).ops) + len(self._outputs.tolist()) - backward.gradients(targets=targets, inputs=[]) + with backend_guard(self._backend): + backward.gradients(targets=targets, inputs=[]) if self._hooker: program, start_idx = self._hooker.after_append_backward( diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 3777af8879d9a..a8be1abb2a10f 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -48,6 +48,7 @@ NO_SHAPE_VAR_TYPE, ast_to_func, ast_to_source_code, + backend_guard, func_to_source_code, input_specs_compatible, is_paddle_func, @@ -334,7 +335,7 @@ def __init__(self, function, input_spec=None, **kwargs): self._class_instance = None if input_spec is not None and prim_or_cinn_is_enabled( - kwargs.get("build_strategy", None) + kwargs.get("build_strategy", None), kwargs.get("backend", None) ): from paddle.static import InputSpec @@ -1184,11 +1185,9 @@ def __init__(self): def _build_once(self, cache_key): # TODO(Aurelius84): Need a gloabl FLAGS to enable/disable to_prim enable_prim = cache_key.kwargs['build_strategy'].build_cinn_pass - # TODO(CZ): later when use cinn, set_prim_all_enabled and check_and_set_prim_all_enabled will be set at else branch. 
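The `backend` argument added to `paddle.jit.to_static` above is forwarded through `StaticFunction` down to `PartialProgramLayer`, where `backend_guard` consults it before appending the backward description. A minimal usage sketch, assuming a default (non-CINN) build; the model and shapes are illustrative, and `backend='CINN'` only takes effect in a CINN-enabled build:

import paddle

class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = paddle.nn.Linear(4, 4)

    def forward(self, x):
        return paddle.nn.functional.relu(self.fc(x))

net = Net()
# backend=None keeps the default executor; backend='CINN' additionally turns on
# build_strategy.build_cinn_pass via _check_and_set_backend (see jit/api.py above).
static_net = paddle.jit.to_static(net, backend=None)
out = static_net(paddle.rand([2, 4]))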
# NOTE(xiongkun): Need a global FLAGS to enable/disable fallback enable_fallback = enable_prim - core.check_and_set_prim_all_enabled() try: concrete_program = ConcreteProgram.from_func_spec( func_spec=cache_key.function_spec, @@ -1216,7 +1215,8 @@ def _build_once(self, cache_key): else: raise - if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy']): + backend = cache_key.kwargs['backend'] + if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend): for var in concrete_program.main_program.list_vars(): if var.type not in NO_SHAPE_VAR_TYPE and -1 in var.shape: warnings.warn( @@ -1228,10 +1228,11 @@ def _build_once(self, cache_key): partial_program = partial_program_from( concrete_program, cache_key.class_instance is not None ) - if core._is_fwd_prim_enabled(): - partial_program.set_hooker( - PrimHooker(concrete_program.main_program) - ) + with backend_guard(backend): + if core._is_fwd_prim_enabled(): + partial_program.set_hooker( + PrimHooker(concrete_program.main_program, backend) + ) return concrete_program, partial_program def __getitem__(self, item): @@ -1291,39 +1292,46 @@ def clear(self): class PrimHooker(PartialProgramLayerHook): - def __init__(self, original_program): + def __init__(self, original_program, backend): if len(original_program.blocks) > 1: raise ValueError( 'The primitive mode only support one block currently.' ) + self.backend = backend self.custom_vjps = set() - if core._is_all_prim_enabled(): - self.custom_vjps = { - op.type - for op in original_program.block(0).ops - if core.has_comp_grad_op_maker(op.type) - } + with backend_guard(self.backend): + if core._is_all_prim_enabled(): + self.custom_vjps = { + op.type + for op in original_program.block(0).ops + if core.has_comp_grad_op_maker(op.type) + } def before_append_backward(self, forward_program): - if core._is_fwd_prim_enabled(): - _to_prim(forward_program.blocks, blacklist=self.custom_vjps) - return forward_program + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + _to_prim(forward_program.blocks, blacklist=self.custom_vjps) + return forward_program def after_append_backward(self, whole_program, backward_start_idx): - backward_length = len(whole_program.block(0).ops) - backward_start_idx - if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0: - # only process backward part of block - _to_prim(whole_program.blocks, backward_length=backward_length) - new_start_index = len(whole_program.block(0).ops) - backward_length - if backward_length > 0: - # only process forward part of block - _to_prim(whole_program.blocks, start_idx=new_start_index) - return whole_program, new_start_index + with backend_guard(self.backend): + backward_length = ( + len(whole_program.block(0).ops) - backward_start_idx + ) + if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0: + # only process backward part of block + _to_prim(whole_program.blocks, backward_length=backward_length) + new_start_index = len(whole_program.block(0).ops) - backward_length + if backward_length > 0: + # only process forward part of block + _to_prim(whole_program.blocks, start_idx=new_start_index) + return whole_program, new_start_index def after_infer(self, infer_program): - if core._is_fwd_prim_enabled(): - _to_prim(infer_program.block(0)) - return infer_program + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + _to_prim(infer_program.block(0)) + return infer_program class ProgramTranslator: diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index 
3608b8d0641a5..28c8c739f2efc 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -35,6 +35,7 @@ from paddle.fluid import core, unique_name from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle.utils import gast from .ast_utils import ast_to_source_code @@ -1498,7 +1499,10 @@ def _out_grad_names(program_desc, fwd_end_op_index, out_size): return names -def prim_or_cinn_is_enabled(build_strategy): +def prim_or_cinn_is_enabled(build_strategy, backend): + if backend == 'CINN': + return True + if build_strategy is not None and build_strategy.build_cinn_pass: return True @@ -1534,3 +1538,18 @@ def name_judge(): return True else: return False + + +@signature_safe_contextmanager +def backend_guard(backend): + core.check_and_set_prim_all_enabled() + orign_fwd = core._is_fwd_prim_enabled() + orign_bwd = core._is_bwd_prim_enabled() + + if backend == 'CINN': + core._set_prim_all_enabled(True) + try: + yield + finally: + core._set_prim_forward_enabled(orign_fwd) + core._set_prim_backward_enabled(orign_bwd) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 4b57c9d936123..c2c98361c75e7 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1016,7 +1016,6 @@ def hsigmoid_loss( attrs = { "num_classes": num_classes, "is_sparse": is_sparse, - "remote_prefetch": is_sparse, } inputs = { diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 8a3659e87b037..0babc935f1de7 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -22,7 +22,7 @@ import numpy as np import paddle -from paddle import profiler +from paddle import nn, profiler from paddle.fluid import core, framework, unique_name from paddle.fluid.core import VarDesc from paddle.fluid.dygraph import no_grad @@ -125,6 +125,13 @@ def _addindent(string, indent): return s1[0] + '\n' + '\n'.join(s2) +def _layer_trans_dtype(layer, dtype, excluded_layers): + if type(layer) in excluded_layers: + return + + layer._to_impl(dtype=dtype, floating_only=True, include_sublayers=False) + + class LayerObjectHelper(LayerHelperBase): def __init__(self, name): super().__init__(name, layer_type=name) @@ -2146,3 +2153,170 @@ def _startup_program(self): # [aliases] Compatible with old method names set_dict = set_state_dict load_dict = set_state_dict + + def float(self, excluded_layers=None): + ''' + Casts all floating point parameters and buffers to ``float`` data type. + + Parameters: + excluded_layers(nn.Layer|list|None, optional): Specify the layers that need to be kept original data type. if excluded_layers is None, casts all floating point parameters and buffers. Default: None. + + Returns: + Layer: self + + Examples: + .. 
code-block:: python + + import paddle + + class Model(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(1, 1) + self.dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + out = self.linear(input) + out = self.dropout(out) + return out + + model = Model() + model.float() + ''' + + excluded_layers = [] if excluded_layers is None else excluded_layers + + if isinstance(excluded_layers, type): + excluded_layers = [excluded_layers] + elif isinstance(excluded_layers, list): + pass + else: + raise TypeError( + "excluded_layers should be type nn.Layer or list, but got %s.", + type(excluded_layers).__name__, + ) + + def layer_trans(layer): + _layer_trans_dtype(layer, paddle.float32, excluded_layers) + + return self.apply(layer_trans) + + def float16(self, excluded_layers=None): + ''' + Casts all floating point parameters and buffers to ``float16`` data type. + + + .. note:: + ``nn.BatchNorm`` does not support ``bfloat16`` weights, so it would not be converted by default. + + + Parameters: + excluded_layers(nn.Layer|list|None, optional): Specify the layers that need to be kept original data type. if excluded_layers is None, casts all floating point parameters and buffers except ``nn.BatchNorm``. Default: None. + + Returns: + Layer: self + + Examples: + .. code-block:: python + + import paddle + + class Model(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(1, 1) + self.dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + out = self.linear(input) + out = self.dropout(out) + return out + + model = Model() + model.float16() + ''' + + if paddle.amp.is_float16_supported() is False: + warnings.warn( + "Paddle compiled by the user does not support float16, so keep original data type." + ) + return self + + excluded_layers = ( + [nn.BatchNorm] if excluded_layers is None else excluded_layers + ) + + if isinstance(excluded_layers, type): + excluded_layers = [excluded_layers] + elif isinstance(excluded_layers, list): + pass + else: + raise TypeError( + "excluded_layers should be type nn.Layer or list, but got %s.", + type(excluded_layers).__name__, + ) + + def layer_trans(layer): + _layer_trans_dtype(layer, paddle.float16, excluded_layers) + + return self.apply(layer_trans) + + def bfloat16(self, excluded_layers=None): + ''' + Casts all floating point parameters and buffers to ``bfloat16`` data type. + + + .. note:: + ``nn.BatchNorm`` does not support ``bfloat16`` weights, so it would not be converted by default. + + + Parameters: + excluded_layers(nn.Layer|list|None, optional): Specify the layers that need to be kept original data type. if excluded_layers is None, casts all floating point parameters and buffers except ``nn.BatchNorm``. Default: None. + + Returns: + Layer: self + + Examples: + .. code-block:: python + + import paddle + + class Model(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(1, 1) + self.dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + out = self.linear(input) + out = self.dropout(out) + return out + + model = Model() + model.bfloat16() + ''' + + if paddle.amp.is_bfloat16_supported() is False: + warnings.warn( + "Paddle compiled by the user does not support bfloat16, so keep original data type." 
+ ) + return self + + excluded_layers = ( + [nn.BatchNorm] if excluded_layers is None else excluded_layers + ) + + if isinstance(excluded_layers, type): + excluded_layers = [excluded_layers] + elif isinstance(excluded_layers, list): + pass + else: + raise TypeError( + "excluded_layers should be type nn.Layer or list, but got %s.", + type(excluded_layers).__name__, + ) + + def layer_trans(layer): + _layer_trans_dtype(layer, paddle.bfloat16, excluded_layers) + + return self.apply(layer_trans) diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index 1cdb61f698e6b..c760c535da022 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -197,6 +197,7 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + self._create_param_lr(param_and_grad), master_weight, self._rho, self._epsilon, @@ -213,6 +214,7 @@ def _append_optimize_op(self, block, param_and_grad): "Grad": param_and_grad[1], "AvgSquaredGrad": avg_squared_grad_acc, "AvgSquaredUpdate": avg_squared_update_acc, + "LearningRate": self._create_param_lr(param_and_grad), } outputs = { "ParamOut": param_and_grad[0], diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py index ced21f9bb758e..19d287f6fa07d 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -99,6 +99,8 @@ def _keep_fp32_input(op, in_name): return in_name != 'X' if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32(): return in_name != 'X' + if op_type == 'instance_norm': + return in_name != 'X' if op_type == 'fused_bn_add_activation': return in_name not in {'X', 'Z'} if op_type == 'resnet_unit': diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 7a859d64d0c51..63af833747b1b 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -120,6 +120,7 @@ def shape(input): 'int64', 'complex64', 'complex128', + 'uint16', ], 'shape', ) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 602fa7186ec84..456e83f816865 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1293,6 +1293,14 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): end = start start = 0 + out_shape = None + if not in_dygraph_mode() and ( + not isinstance(start, Variable) + and not isinstance(end, Variable) + and not isinstance(step, Variable) + ): + out_shape = [int(math.ceil((end - start) / step))] + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -1324,13 +1332,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): 'range/arange', ) helper = LayerHelper('range', **locals()) - out_shape = None - if ( - not isinstance(start, Variable) - and not isinstance(end, Variable) - and not isinstance(step, Variable) - ): - out_shape = [int(math.ceil((end - start) / step))] out = helper.create_variable_for_type_inference(dtype, shape=out_shape) helper.append_op( type='range', @@ -1954,13 +1955,29 @@ def empty_like(x, dtype=None, name=None): check_variable_and_dtype( x, 'x', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], 'empty_like', ) check_dtype( dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'bool', + 'float16', + 'float32', + 'float64', + 
'int32', + 'int64', + 'uint16', + ], 'empty_like', ) out = helper.create_variable_for_type_inference(dtype=dtype) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ba7efb7956f77..1e969be880401 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3466,7 +3466,14 @@ def isfinite(x, name=None): check_variable_and_dtype( x, 'x', - ['float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], 'isfinite', ) out = helper.create_variable_for_type_inference('bool') @@ -3502,7 +3509,17 @@ def isinf(x, name=None): else: helper = LayerHelper("isinf_v2", **locals()) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf' + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], + 'isinf', ) out = helper.create_variable_for_type_inference(dtype='bool') helper.append_op(type="isinf_v2", inputs={"X": x}, outputs={"Out": out}) @@ -3535,7 +3552,17 @@ def isnan(x, name=None): else: helper = LayerHelper("isnan_v2", **locals()) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan' + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], + 'isnan', ) out = helper.create_variable_for_type_inference(dtype='bool') helper.append_op(type="isnan_v2", inputs={"X": x}, outputs={"Out": out}) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index e78cc85f73ca0..8ff70ca4c0e6f 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -1041,7 +1041,6 @@ def _gen_output_content( ): # ' ' * tab space * tab number indent = ' ' * 4 * 2 - inplace_idx = {v: k for k, v in inplace_reverse_idx.items()} dynamic_content = f""" {indent}res = [] {indent}start_idx = 0""" @@ -1134,7 +1133,6 @@ def _custom_api_content(op_name): attrs_map, inplace_reverse_idx, ) - lower_in_list = [p.split("@")[0].lower() for p in in_names] API_TEMPLATE = textwrap.dedent( """ import paddle.fluid.core as core @@ -1161,11 +1159,6 @@ def {op_name}({params_list}): api_content = API_TEMPLATE.format( op_name=op_name, params_list=params_list, - ins_map=ins_map, - attrs_map=attrs_map, - # "[x, y, z]"" - in_names="[" + ",".join(lower_in_list) + "]", - attr_names="[" + ",".join(attr_names) + "]", outs_list=outs_list, dynamic_content=dynamic_content, static_content=static_content, diff --git a/python/setup.py.in b/python/setup.py.in index 650a4449b24c6..fa32dcf13c3d5 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -100,6 +100,32 @@ def is_taged(): else: return False +def get_cinn_version(): + if '@WITH_CINN@' != 'ON': + return "False" + + cinn_git_version = 'Unknown' + try: + cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] + cinn_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd='@CINN_SOURCE_DIR@').communicate()[0].strip() + if len(cinn_tag) > 0: + cinn_git_version = cinn_tag + except: + pass + + if cinn_git_version == 'Unknown': + try: + cmd = ['git', 'rev-parse', 'HEAD'] + cinn_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE, + cwd='@CINN_SOURCE_DIR@').communicate()[0].strip() + if len(cinn_commit) > 0: + cinn_git_version = cinn_commit + except: + pass + + cinn_git_version = cinn_git_version.decode('utf-8') + return str(cinn_git_version) + def 
write_version_py(filename='paddle/version/__init__.py'): cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # @@ -115,6 +141,7 @@ xpu_xccl_version = '%(xpu_xccl)s' istaged = %(istaged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' +cinn_version = '%(cinn)s' __all__ = ['cuda', 'cudnn', 'show', 'xpu', 'xpu_xccl'] @@ -143,6 +170,8 @@ def show(): xpu_xccl: the xpu xccl version of package. It will return `False` if non-XPU version paddle package is installed + cinn: the cinn version of package. It will return `False` if paddle package is not compiled with CINN + Examples: .. code-block:: python @@ -159,6 +188,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False # Case 2: paddle is not tagged paddle.version.show() @@ -167,6 +197,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False """ if istaged: print('full_version:', full_version) @@ -180,6 +211,7 @@ def show(): print('cudnn:', cudnn_version) print('xpu:', xpu_version) print('xpu_xccl:', xpu_xccl_version) + print('cinn:', cinn_version) def mkl(): return with_mkl @@ -251,6 +283,23 @@ def xpu_xccl(): """ return xpu_xccl_version + +def cinn(): + """Get CINN version of paddle package. + + Returns: + string: Return the version information of CINN. If paddle package is not compiled with CINN, it will return False. + + Examples: + .. code-block:: python + + import paddle + + paddle.version.cinn() + # False + + """ + return cinn_version ''' commit = git_commit() @@ -275,7 +324,8 @@ def xpu_xccl(): 'xpu_xccl': get_xpu_xccl_version(), 'commit': commit, 'istaged': is_taged(), - 'with_mkl': '@WITH_MKL@'}) + 'with_mkl': '@WITH_MKL@', + 'cinn': get_cinn_version()}) write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version/__init__.py') diff --git a/setup.py b/setup.py index daa9dbd5cc6e4..288500feba854 100644 --- a/setup.py +++ b/setup.py @@ -427,6 +427,57 @@ def is_taged(): return False +def get_cinn_version(): + if env_dict.get("WITH_CINN") != 'ON': + return "False" + + cinn_git_version = 'Unknown' + # try get cinn tag name + try: + cmd = [ + 'git', + 'describe', + '--exact-match', + '--tags', + 'HEAD', + '2>/dev/null', + ] + cinn_tag = ( + subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + cwd=env_dict.get("CINN_SOURCE_DIR"), + ) + .communicate()[0] + .strip() + ) + if len(cinn_tag) > 0: + cinn_git_version = cinn_tag + except: + pass + + if cinn_git_version == 'Unknown': + # try get cinn commit id + try: + cmd = ['git', 'rev-parse', 'HEAD'] + cinn_commit = ( + subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + cwd=env_dict.get("CINN_SOURCE_DIR"), + ) + .communicate()[0] + .strip() + ) + if len(cinn_commit) > 0: + cinn_git_version = cinn_commit + except: + pass + + cinn_git_version = cinn_git_version.decode('utf-8') + return str(cinn_git_version) + + def write_version_py(filename='paddle/version/__init__.py'): cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # @@ -442,6 +493,7 @@ def write_version_py(filename='paddle/version/__init__.py'): istaged = %(istaged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' +cinn_version = '%(cinn)s' __all__ = ['cuda', 'cudnn', 'show', 'xpu', 'xpu_xccl'] @@ -470,6 +522,8 @@ def show(): xpu_xccl: the xpu xccl version of package. It will return `False` if non-XPU version paddle package is installed + cinn: the cinn version of package. It will return `False` if paddle package is not compiled with CINN + Examples: .. 
code-block:: python @@ -486,6 +540,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False # Case 2: paddle is not tagged paddle.version.show() @@ -494,6 +549,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False """ if istaged: print('full_version:', full_version) @@ -507,6 +563,7 @@ def show(): print('cudnn:', cudnn_version) print('xpu:', xpu_version) print('xpu_xccl:', xpu_xccl_version) + print('cinn:', cinn_version) def mkl(): return with_mkl @@ -578,6 +635,23 @@ def xpu_xccl(): """ return xpu_xccl_version + +def cinn(): + """Get CINN version of paddle package. + + Returns: + string: Return the version information of CINN. If paddle package is not compiled with CINN, it will return False. + + Examples: + .. code-block:: python + + import paddle + + paddle.version.cinn() + # False + + """ + return cinn_version ''' commit = git_commit() @@ -605,6 +679,7 @@ def xpu_xccl(): 'commit': commit, 'istaged': is_taged(), 'with_mkl': env_dict.get("WITH_MKL"), + 'cinn': get_cinn_version(), } ) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d9d9cb5504f1c..8bbd59a7176ff 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -107,7 +107,9 @@ if(WITH_TESTING) # add_subdirectory(distributed_passes) # add_subdirectory(distribution) add_subdirectory(dygraph_to_static) - # add_subdirectory(fft) + if(NOT WIN32 OR NOT WITH_GPU) + add_subdirectory(fft) + endif() # add_subdirectory(fleet) if(WITH_IPU) add_subdirectory(ipu) @@ -124,9 +126,11 @@ if(WITH_TESTING) add_subdirectory(rpc) # add_subdirectory(sequence) add_subdirectory(standalone_executor) - # add_subdirectory(tokenizer) + add_subdirectory(tokenizer) # add_subdirectory(white_list) - add_subdirectory(xpu) + if(WITH_XPU) + add_subdirectory(xpu) + endif() endif() get_property(test_srcs GLOBAL PROPERTY TEST_SRCS) diff --git a/test/amp/test_layer_convert_dtype.py b/test/amp/test_layer_convert_dtype.py new file mode 100644 index 0000000000000..86ee5c5fc2503 --- /dev/null +++ b/test/amp/test_layer_convert_dtype.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.fluid import core + + +class MyModel(paddle.nn.Layer): + def __init__(self, input_size, hidden_size): + super().__init__() + self.linear1 = paddle.nn.Linear(input_size, hidden_size) + self.linear2 = paddle.nn.Linear(hidden_size, hidden_size) + self.linear3 = paddle.nn.Linear(hidden_size, 1) + self.batchnorm = paddle.nn.Sequential(paddle.nn.BatchNorm(hidden_size)) + register_buffer_in_temp = paddle.ones([4, 6]) + self.register_buffer('register_buffer_in', register_buffer_in_temp) + + def forward(self, inputs): + x = self.linear1(inputs) + x = F.relu(x) + x = self.batchnorm(x) + x = self.linear3(x) + return x + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "Require compiled with CUDA." 
+) +class TestDtypeConvert(unittest.TestCase): + def setUp(self): + self.batch_size, self.input_size, self.hidden_size = 128, 128, 256 + + def verify_trans_dtype( + self, test_type=None, excluded_layers=None, corrected_dtype=None + ): + model = MyModel(self.input_size, self.hidden_size) + if test_type == 'float16': + model.float16(excluded_layers=excluded_layers) + elif test_type == 'bfloat16': + model.bfloat16(excluded_layers=excluded_layers) + else: + model.float(excluded_layers=excluded_layers) + + for name, para in model.named_parameters(): + if 'linear' in name: + self.assertEqual(para.dtype, corrected_dtype) + elif 'batchnorm' in name: + if excluded_layers is None: + self.assertEqual(para.dtype, paddle.float32) + else: + self.assertEqual(para.dtype, paddle.float16) + + def test_excluded_layers(self): + self.verify_trans_dtype( + test_type='float16', + excluded_layers=[nn.Linear], + corrected_dtype=paddle.float32, + ) + self.verify_trans_dtype( + test_type='float16', + excluded_layers=nn.Linear, + corrected_dtype=paddle.float32, + ) + + def test_float16(self): + self.verify_trans_dtype( + test_type='float16', + corrected_dtype=paddle.float16, + ) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] >= 8.0, + "run test when maximum gpu's compute capability is 8.0.", + ) + def test_unsupported_bfloat16(self): + self.verify_trans_dtype( + test_type='bfloat16', + corrected_dtype=paddle.float32, + ) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] < 8.0, + "run test when gpu's compute capability is at least 8.0.", + ) + def test_supported_bfloat16(self): + self.verify_trans_dtype( + test_type='bfloat16', + corrected_dtype=paddle.bfloat16, + ) + + def test_float32(self): + paddle.set_default_dtype('float16') + self.verify_trans_dtype( + test_type='float32', + corrected_dtype=paddle.float32, + ) + paddle.set_default_dtype('float32') + + def test_excluded_layers_type_error(self): + self.assertRaises( + TypeError, self.verify_trans_dtype, excluded_layers=111 + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "Require compiled with CUDA." 
+) +class TestSupportedTypeInfo(unittest.TestCase): + def test_cpu(self): + res = paddle.amp.is_float16_supported('cpu') + self.assertEqual(res, False) + res = paddle.amp.is_bfloat16_supported('cpu') + self.assertEqual(res, True) + + def test_gpu_fp16_supported(self): + res = paddle.amp.is_float16_supported() + self.assertEqual(res, True) + res = paddle.amp.is_float16_supported('gpu') + self.assertEqual(res, True) + res = paddle.amp.is_float16_supported('gpu:0') + self.assertEqual(res, True) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] >= 8.0, + "run test when maximum gpu's compute capability is 8.0.", + ) + def test_gpu_bf16_unsupported(self): + res = paddle.amp.is_bfloat16_supported() + self.assertEqual(res, False) + res = paddle.amp.is_bfloat16_supported('gpu') + self.assertEqual(res, False) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] < 8.0, + "run test when gpu's compute capability is at least 8.0.", + ) + def test_gpu_bf16_supported(self): + res = paddle.amp.is_bfloat16_supported() + self.assertEqual(res, True) + res = paddle.amp.is_bfloat16_supported('gpu') + self.assertEqual(res, True) + + def test_device_value_error(self): + self.assertRaises( + ValueError, paddle.amp.is_float16_supported, device='xxx' + ) + self.assertRaises( + ValueError, paddle.amp.is_float16_supported, device=111 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/cpp/phi/kernels/CMakeLists.txt b/test/cpp/phi/kernels/CMakeLists.txt index a9e897eb614dc..3e7f394f186da 100644 --- a/test/cpp/phi/kernels/CMakeLists.txt +++ b/test/cpp/phi/kernels/CMakeLists.txt @@ -105,3 +105,8 @@ cc_test( sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) + +cc_test( + sequence_pooling_test + SRCS sequence_pooling_test.cc + DEPS sequence_pooling) diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/test/cpp/phi/kernels/sequence_pooling_test.cc similarity index 81% rename from paddle/fluid/operators/math/sequence_pooling_test.cc rename to test/cpp/phi/kernels/sequence_pooling_test.cc index dac5eb63bfc13..3c12d55ed360f 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/test/cpp/phi/kernels/sequence_pooling_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "paddle/fluid/operators/math/sequence_pooling.h" -#include +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/tensor_utils.h" template void TestSequencePoolingSum(const DeviceContext &context, - const paddle::framework::LoD &lod, + const phi::LoD &lod, const int64_t second_dim) { phi::DenseTensor cpu_out_grad; phi::DenseTensor cpu_in_grad; @@ -30,17 +34,17 @@ void TestSequencePoolingSum(const DeviceContext &context, auto out_dims = phi::make_ddim({static_cast(out_first_dim), second_dim}); - cpu_out_grad.mutable_data(out_dims, paddle::platform::CPUPlace()); + cpu_out_grad.mutable_data(out_dims, phi::CPUPlace()); for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) { cpu_out_grad.data()[i] = static_cast(i); } // copy to dst out_grad auto place = context.GetPlace(); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { out_grad = cpu_out_grad; } else { - paddle::framework::TensorCopySync(cpu_out_grad, place, &out_grad); + phi::Copy(context, cpu_out_grad, place, true, &out_grad); } // construct in_grad @@ -53,7 +57,7 @@ void TestSequencePoolingSum(const DeviceContext &context, PADDLE_ENFORCE_EQ( in_grad.dims().size(), out_grad.dims().size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of input and output shall be same. Expected %ld == " "%ld, but got %ld != %ld. Please check the input value.", in_grad.dims().size(), @@ -64,7 +68,7 @@ void TestSequencePoolingSum(const DeviceContext &context, PADDLE_ENFORCE_EQ( in_grad.dims()[i], out_grad.dims()[i], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of input and output shall be same. Expected %ld == " "%ld, but got %ld != %ld. 
Please check the input value.", in_grad.dims()[i], @@ -77,18 +81,17 @@ void TestSequencePoolingSum(const DeviceContext &context, paddle::operators::math::SequencePoolGradFunctor()( context, "SUM", out_grad, &in_grad); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { cpu_in_grad = in_grad; } else { - paddle::framework::TensorCopySync( - in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); + phi::Copy(context, in_grad, phi::CPUPlace(), true, &cpu_in_grad); cpu_in_grad.set_lod(in_grad.lod()); } EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; @@ -116,30 +119,30 @@ void TestSequencePoolingSum(const DeviceContext &context, } TEST(SequencePoolingGrad, CPU_SUM) { - auto place = paddle::platform::CPUPlace(); + auto place = phi::CPUPlace(); auto *context = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); + phi::DeviceContextPool::Instance().Get(place)); - paddle::framework::LoD lod1; + phi::LoD lod1; lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(*context, lod1, 128); - paddle::framework::LoD lod2; + phi::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePoolingGrad, CUDA_SUM) { - auto place = paddle::platform::CUDAPlace(0); + auto place = phi::GPUPlace(0); auto *context = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); + phi::DeviceContextPool::Instance().Get(place)); - paddle::framework::LoD lod1; + phi::LoD lod1; lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(*context, lod1, 128); - paddle::framework::LoD lod2; + phi::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(*context, lod2, 128); } diff --git a/test/custom_op/attr_test_op.cc b/test/custom_op/attr_test_op.cc index 14cb0aa7c716d..819d5e0ea3a2d 100644 --- a/test/custom_op/attr_test_op.cc +++ b/test/custom_op/attr_test_op.cc @@ -132,7 +132,7 @@ std::vector AttrTestForward( std::vector float_vec_attr, std::vector int64_vec_attr, std::vector str_vec_attr) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -173,7 +173,7 @@ std::vector AttrTestBackward( int int_attr, const std::vector& float_vec_attr, const std::vector& str_vec_attr) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); + auto grad_x = paddle::empty_like(grad_out); PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( @@ -198,7 +198,7 @@ std::vector ConstAttrTestForward( const std::vector& float_vec_attr, const std::vector& int64_vec_attr, const std::vector& str_vec_attr) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -239,7 +239,7 @@ std::vector ConstAttrTestBackward( const int& int_attr, const std::vector& float_vec_attr, const std::vector& str_vec_attr) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); + auto grad_x = paddle::empty_like(grad_out); PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { 
assign_cpu_kernel( diff --git a/test/custom_op/context_pool_test_op.cc b/test/custom_op/context_pool_test_op.cc index 1687bdccc9227..72b28064f0a3f 100644 --- a/test/custom_op/context_pool_test_op.cc +++ b/test/custom_op/context_pool_test_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" #include "paddle/phi/backends/context_pool.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") std::vector ContextPoolTest(const paddle::Tensor& x) { // 1. test cpu context diff --git a/test/custom_op/custom_concat_op.cc b/test/custom_op/custom_concat_op.cc index 80f76e2df54fe..e34fffff7b2bb 100644 --- a/test/custom_op/custom_concat_op.cc +++ b/test/custom_op/custom_concat_op.cc @@ -17,8 +17,7 @@ #include "concat_and_split.h" // NOLINT #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") int64_t ComputeAxis(int64_t axis, int64_t rank) { PD_CHECK(axis >= -rank && axis < rank, diff --git a/test/custom_op/custom_conj_op.cc b/test/custom_op/custom_conj_op.cc index 56938552420e7..0f76f715c427f 100644 --- a/test/custom_op/custom_conj_op.cc +++ b/test/custom_op/custom_conj_op.cc @@ -18,8 +18,7 @@ #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template using EnableComplex = typename std::enable_if< diff --git a/test/custom_op/custom_inplace.cc b/test/custom_op/custom_inplace.cc index fbbe10b513ece..f7db7922bf3f7 100644 --- a/test/custom_op/custom_inplace.cc +++ b/test/custom_op/custom_inplace.cc @@ -18,6 +18,8 @@ #include "paddle/extension.h" +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + template void add_data_pointer(const data_t* x_data, data_t* out_data, int64_t numel) { for (size_t i = 0; i < numel; ++i) { @@ -52,7 +54,7 @@ void relu_backward_kernel(const data_t* out_data, } void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "AddForward", ([&] { @@ -63,8 +65,8 @@ void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT std::vector AddBackward(const paddle::Tensor& x, const paddle::Tensor& y, paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(y); paddle::Tensor y_grad = paddle::empty(x.shape(), x.dtype(), x.place()); @@ -92,7 +94,7 @@ PD_BUILD_GRAD_OP(custom_add) // out[i] = x[i] + y void AddVectorForward(std::vector& x, // NOLINT const paddle::Tensor& y) { - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(y); PD_DISPATCH_FLOATING_TYPES(y.type(), "AddVectorForward", ([&] { for (size_t i = 0; i < x.size(); ++i) { @@ -109,9 +111,8 @@ std::vector AddVectorBackward( const std::vector& x, const paddle::Tensor& y, std::vector& out_grad) { // NOLINT - PD_CHECK(x[0].place() == paddle::PlaceType::kCPU, - "x[0] must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(x[0]); + CHECK_INPUT(y); PD_CHECK(x.size() == 
out_grad.size(), "x must have the same size as out_grad."); @@ -145,8 +146,8 @@ void MultiInplaceForward(paddle::Tensor& x, // NOLINT const paddle::Tensor& y, paddle::Tensor& a, // NOLINT const paddle::Tensor& b) { - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(a.place() == paddle::PlaceType::kCPU, "a must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(a); PD_DISPATCH_FLOATING_TYPES( x.type(), "MultiInplaceForward", ([&] { @@ -162,10 +163,10 @@ std::vector MultiInplaceBackward( const paddle::Tensor& a, const paddle::Tensor& b, paddle::Tensor& outab_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); - PD_CHECK(a.place() == paddle::PlaceType::kCPU, "a must be a CPU Tensor."); - PD_CHECK(b.place() == paddle::PlaceType::kCPU, "b must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(y); + CHECK_INPUT(a); + CHECK_INPUT(b); paddle::Tensor y_grad = paddle::empty(x.shape(), x.dtype(), x.place()); paddle::Tensor b_grad = paddle::empty(a.shape(), a.dtype(), a.place()); @@ -200,7 +201,7 @@ PD_BUILD_GRAD_OP(custom_multi_inplace) .SetKernelFn(PD_KERNEL(MultiInplaceBackward)); void ReluForwardInplace(paddle::Tensor& x) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); PD_DISPATCH_FLOATING_TYPES(x.type(), "ReluForward", ([&] { relu_forward_kernel(x.data(), @@ -211,7 +212,7 @@ void ReluForwardInplace(paddle::Tensor& x) { // NOLINT void ReluBackwardInplace(const paddle::Tensor& x, const paddle::Tensor& out, paddle::Tensor& grad_out) { // NOLINT - PD_CHECK(out.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(out); PD_DISPATCH_FLOATING_TYPES( grad_out.type(), "ReluBackward", ([&] { diff --git a/test/custom_op/custom_optional.cc b/test/custom_op/custom_optional.cc index 0e28ce84d5a35..9d247f4a27694 100644 --- a/test/custom_op/custom_optional.cc +++ b/test/custom_op/custom_optional.cc @@ -18,6 +18,8 @@ #include "paddle/extension.h" +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + template void add_one_pointer(const data_t* x_data, data_t* out_data, int64_t numel) { for (size_t i = 0; i < numel; ++i) { @@ -45,7 +47,7 @@ if (y) { std::vector AddForward( const paddle::Tensor& x, const paddle::optional& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor out = paddle::empty(x.shape(), x.dtype(), x.place()); if (y) { @@ -85,7 +87,7 @@ std::vector AddBackward( const paddle::Tensor& x, const paddle::optional& y, const paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); if (y) { @@ -118,7 +120,7 @@ if (y) { std::vector AddVectorForward( const paddle::Tensor& x, const paddle::optional>& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor out = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -167,7 +169,7 @@ std::vector AddVectorBackward( const paddle::Tensor& x, const paddle::optional>& y, const paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); @@ -208,7 +210,7 @@ if 
(y) { std::vector AddOptionalInplaceForward( const paddle::Tensor& x, paddle::optional& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor outX = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -252,7 +254,7 @@ std::vector AddOptionalInplaceBackward( const paddle::optional& y, const paddle::Tensor& outx_grad, paddle::optional& outy_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); @@ -313,7 +315,7 @@ if (y) { std::vector AddOptionalInplaceVectorForward( const paddle::Tensor& x, paddle::optional>& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor outX = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -359,7 +361,7 @@ std::vector AddOptionalInplaceVectorBackward( const paddle::optional>& y, const paddle::Tensor& outx_grad, paddle::optional>& outy_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); diff --git a/test/custom_op/custom_relu_op.cc b/test/custom_op/custom_relu_op.cc index 7575887318ce3..5627bb28b921f 100644 --- a/test/custom_op/custom_relu_op.cc +++ b/test/custom_op/custom_relu_op.cc @@ -128,9 +128,9 @@ std::vector ReluBackward(const paddle::Tensor& x, std::vector ReluDoubleBackward(const paddle::Tensor& out, const paddle::Tensor& ddx) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_double_backward(out, ddx); - } else if (out.place() == paddle::PlaceType::kGPU) { + } else if (out.is_gpu()) { return relu_cuda_double_backward(out, ddx); } else { PD_THROW("Not implemented."); @@ -179,9 +179,9 @@ std::vector relu_cuda_backward_without_x( std::vector ReluBackwardWithoutX( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_backward_without_x(out, grad_out); - } else if (out.place() == paddle::PlaceType::kGPU) { + } else if (out.is_gpu()) { return relu_cuda_backward_without_x(out, grad_out); } else { PD_THROW("Not implemented."); @@ -235,9 +235,9 @@ void relu_cuda_backward_out(const paddle::Tensor& x, paddle::Tensor* grad_x); void ReluForwardOut(const paddle::Tensor& x, paddle::Tensor* out) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_forward_out(x, out); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_forward_out(x, out); } else { PD_THROW("Not implemented."); @@ -248,9 +248,9 @@ void ReluBackwardOut(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_backward_out(x, out, grad_out, grad_x); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_backward_out(x, out, grad_out, grad_x); } else { PD_THROW("Not implemented."); diff --git a/test/custom_op/custom_relu_op_xpu.cc b/test/custom_op/custom_relu_op_xpu.cc index c38f8b877da2c..ee717785ad848 100644 --- a/test/custom_op/custom_relu_op_xpu.cc +++ b/test/custom_op/custom_relu_op_xpu.cc @@ -161,7 +161,7 @@ std::vector ReluBackward(const paddle::Tensor& x, std::vector 
ReluDoubleBackward(const paddle::Tensor& out, const paddle::Tensor& ddx) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_double_backward(out, ddx); } else if (out.place().GetType() == phi::AllocationType::XPU) { return relu_xpu_double_backward(out, ddx); diff --git a/test/custom_op/custom_simple_slice_op.cc b/test/custom_op/custom_simple_slice_op.cc index 783e0cd96fdd9..21bd1b8ada27d 100644 --- a/test/custom_op/custom_simple_slice_op.cc +++ b/test/custom_op/custom_simple_slice_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") std::vector SimpleSliceFunction(const paddle::Tensor& x, int64_t begin_index, diff --git a/test/custom_op/custom_tanh_op.cc b/test/custom_op/custom_tanh_op.cc index 399eb5b6366d7..a7a61b9528352 100644 --- a/test/custom_op/custom_tanh_op.cc +++ b/test/custom_op/custom_tanh_op.cc @@ -18,8 +18,7 @@ #include "paddle/extension.h" -#define CHECK_CPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template void tanh_cpu_forward_kernel(const data_t* x_data, diff --git a/test/custom_op/dispatch_test_op.cc b/test/custom_op/dispatch_test_op.cc index 0f7d323b5451e..39e1a24fe2327 100644 --- a/test/custom_op/dispatch_test_op.cc +++ b/test/custom_op/dispatch_test_op.cc @@ -27,7 +27,7 @@ void assign_cpu_kernel(const data_t* x_data, } std::vector DispatchTestInterger(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -45,7 +45,7 @@ PD_BUILD_OP(dispatch_test_integer) std::vector DispatchTestFloatAndInteger( const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -62,7 +62,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)); std::vector DispatchTestComplex(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -80,7 +80,7 @@ PD_BUILD_OP(dispatch_test_complex) std::vector DispatchTestFloatAndComplex( const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -98,7 +98,7 @@ PD_BUILD_OP(dispatch_test_float_and_complex) std::vector DispatchTestFloatAndIntegerAndComplex( const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -115,7 +115,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)); std::vector DispatchTestFloatAndHalf(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "assign_cpu_kernel", ([&] { diff --git a/test/custom_op/multi_out_test_op.cc 
b/test/custom_op/multi_out_test_op.cc index d9e0526e4206e..7007058cbb93e 100644 --- a/test/custom_op/multi_out_test_op.cc +++ b/test/custom_op/multi_out_test_op.cc @@ -34,7 +34,7 @@ void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) { } std::vector MultiOutCPU(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -43,13 +43,13 @@ std::vector MultiOutCPU(const paddle::Tensor& x) { })); // fake multi output: Fake_float64 with float64 dtype - auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto fake_float64 = paddle::empty_like(x); fill_constant_cpu_kernel( fake_float64.mutable_data(x.place()), x.size(), 0.); // fake multi output: ZFake_int32 with int32 dtype - auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto zfake_int32 = paddle::empty_like(x); fill_constant_cpu_kernel( zfake_int32.mutable_data(x.place()), x.size(), 1); diff --git a/test/custom_op/test_custom_conj.py b/test/custom_op/test_custom_conj.py index c30463bc34869..f51038ae1b34c 100644 --- a/test/custom_op/test_custom_conj.py +++ b/test/custom_op/test_custom_conj.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle from paddle import static @@ -100,42 +100,27 @@ def setUp(self): self.dtypes = ['float32', 'float64'] self.shape = [2, 20, 2, 3] - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - - def run_dynamic(self, dtype, np_input): - out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input) - pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input) - - self.check_output(out, pd_out, "out") - self.check_output(x_grad, pd_x_grad, "x's grad") - - def run_static(self, dtype, np_input): - out, x_grad = conj_static( - custom_ops.custom_conj, self.shape, dtype, np_input - ) - pd_out, pd_x_grad = conj_static( - paddle.conj, self.shape, dtype, np_input - ) - - self.check_output(out, pd_out, "out") - self.check_output(x_grad, pd_x_grad, "x's grad") - def test_dynamic(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) - self.run_dynamic(dtype, np_input) + out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input) + pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input) + + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x's grad") def test_static(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) - self.run_static(dtype, np_input) + out, x_grad = conj_static( + custom_ops.custom_conj, self.shape, dtype, np_input + ) + pd_out, pd_x_grad = conj_static( + paddle.conj, self.shape, dtype, np_input + ) + + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x's grad") # complex only used in dynamic mode now def test_complex_dynamic(self): @@ -143,7 +128,16 @@ def test_complex_dynamic(self): np_input = np.random.random(self.shape).astype( dtype ) + 1j * np.random.random(self.shape).astype(dtype) - self.run_dynamic(to_complex(dtype), np_input) + + out, x_grad = conj_dynamic( + custom_ops.custom_conj, to_complex(dtype), np_input + ) + pd_out, pd_x_grad = conj_dynamic( + paddle.conj, 
to_complex(dtype), np_input + ) + + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x's grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_inplace.py b/test/custom_op/test_custom_inplace.py index 2c0a5d4c513c1..f5eed712cdcf9 100644 --- a/test/custom_op/test_custom_inplace.py +++ b/test/custom_op/test_custom_inplace.py @@ -16,7 +16,13 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import ( + check_output, + check_output_allclose, + extra_cc_args, + extra_nvcc_args, + paddle_includes, +) import paddle from paddle import static @@ -342,26 +348,6 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - - def check_output_allclose(self, out, pd_out, name): - np.testing.assert_allclose( - out, - pd_out, - rtol=5e-5, - atol=1e-2, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -391,15 +377,15 @@ def test_static_add(self): self.np_x, self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output( + check_output(custom_x, custom_out, "inplace_custom_x") + check_output( custom_x_grad, custom_out_grad, "inplace_custom_x_grad" ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_out_grad, pd_out_grad, "out_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add(self): for device in self.devices: @@ -431,14 +417,14 @@ def test_dynamic_add(self): self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output(pd_x, pd_out, "inplace_pd_x") + check_output(custom_x, custom_out, "inplace_custom_x") + check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_add_vector(self): for device in self.devices: @@ -468,10 +454,10 @@ def test_static_add_vector(self): self.np_y, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_out_grad, pd_out_grad, "out_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add_vector(self): for device in self.devices: @@ -503,14 +489,14 @@ def test_dynamic_add_vector(self): self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output(pd_x, pd_out, "inplace_pd_x") + check_output(custom_x, custom_out, 
"inplace_custom_x") + check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_relu_net(self): for device in self.devices: @@ -543,11 +529,11 @@ def test_static_relu_net(self): self.np_y, self.np_z, ) - self.check_output_allclose(custom_x, pd_x, "x") - self.check_output_allclose(custom_y, pd_y, "y") - self.check_output_allclose(custom_out, pd_out, "out") - self.check_output_allclose(custom_x_grad, pd_x_grad, "x_grad") - self.check_output_allclose(custom_y_grad, pd_y_grad, "y_grad") + check_output_allclose(custom_x, pd_x, "x") + check_output_allclose(custom_y, pd_y, "y") + check_output_allclose(custom_out, pd_out, "out") + check_output_allclose(custom_x_grad, pd_x_grad, "x_grad") + check_output_allclose(custom_y_grad, pd_y_grad, "y_grad") def test_dynamic_relu_net(self): for device in self.devices: @@ -581,11 +567,11 @@ def test_dynamic_relu_net(self): self.np_z, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_multi_inplace(self): for device in self.devices: @@ -630,27 +616,23 @@ def test_static_multi_inplace(self): self.np_a, self.np_b, ) - self.check_output(custom_x, pd_out_xy, "inplace_custom_x") - self.check_output( + check_output(custom_x, pd_out_xy, "inplace_custom_x") + check_output( custom_x_grad, custom_out_xy_grad, "inplace_custom_x_grad" ) - self.check_output(custom_a, pd_out_ab, "inplace_custom_a") - self.check_output( + check_output(custom_a, pd_out_ab, "inplace_custom_a") + check_output( custom_a_grad, custom_out_ab_grad, "inplace_custom_a_grad" ) - self.check_output(custom_out_xy, pd_out_xy, "outxy") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output( - custom_out_xy_grad, pd_out_xy_grad, "outxy_grad" - ) - self.check_output(custom_out_ab, pd_out_ab, "outab") - self.check_output(custom_a_grad, pd_a_grad, "a_grad") - self.check_output(custom_b_grad, pd_b_grad, "b_grad") - self.check_output( - custom_out_ab_grad, pd_out_ab_grad, "outab_grad" - ) + check_output(custom_out_xy, pd_out_xy, "outxy") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out_xy_grad, pd_out_xy_grad, "outxy_grad") + check_output(custom_out_ab, pd_out_ab, "outab") + check_output(custom_a_grad, pd_a_grad, "a_grad") + check_output(custom_b_grad, pd_b_grad, "b_grad") + check_output(custom_out_ab_grad, pd_out_ab_grad, "outab_grad") def test_dynamic_multi_inplace(self): for device in self.devices: @@ -696,21 +678,21 @@ def test_dynamic_multi_inplace(self): self.np_b, ) - self.check_output(custom_x, custom_out_xy, "inplace_custom_x") - self.check_output(pd_x, pd_out_xy, "inplace_pd_x") - 
self.check_output(custom_a, custom_out_ab, "inplace_custom_a") - self.check_output(pd_a, pd_out_ab, "inplace_pd_a") - - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out_xy, pd_out_xy, "outxy") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_a, pd_a, "a") - self.check_output(custom_b, pd_b, "b") - self.check_output(custom_out_ab, pd_out_ab, "outab") - self.check_output(custom_a_grad, pd_a_grad, "a_grad") - self.check_output(custom_b_grad, pd_b_grad, "b_grad") + check_output(custom_x, custom_out_xy, "inplace_custom_x") + check_output(pd_x, pd_out_xy, "inplace_pd_x") + check_output(custom_a, custom_out_ab, "inplace_custom_a") + check_output(pd_a, pd_out_ab, "inplace_pd_a") + + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out_xy, pd_out_xy, "outxy") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_a, pd_a, "a") + check_output(custom_b, pd_b, "b") + check_output(custom_out_ab, pd_out_ab, "outab") + check_output(custom_a_grad, pd_a_grad, "a_grad") + check_output(custom_b_grad, pd_b_grad, "b_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_linear.py b/test/custom_op/test_custom_linear.py index 5cd4b5e14f7dd..60a881bdb6a0c 100644 --- a/test/custom_op/test_custom_linear.py +++ b/test/custom_op/test_custom_linear.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle import paddle.nn.functional as F @@ -99,15 +99,6 @@ def setUp(self): self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32") self.np_bias = np.ones([4], dtype="float32") - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_static(self): for device in self.devices: for dtype in self.dtypes: @@ -132,12 +123,10 @@ def test_static(self): self.np_weight, self.np_bias, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output( - custom_weight_grad, pd_weight_grad, "weight_grad" - ) - self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_weight_grad, pd_weight_grad, "weight_grad") + check_output(custom_bias_grad, pd_bias_grad, "bias_grad") def test_dynamic(self): for device in self.devices: @@ -168,12 +157,10 @@ def test_dynamic(self): self.np_weight, self.np_bias, ) - self.check_output(custom_out, pd_out, "custom_out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output( - custom_weight_grad, pd_weight_grad, "weight_grad" - ) - self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") + check_output(custom_out, pd_out, "custom_out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_weight_grad, pd_weight_grad, "weight_grad") + check_output(custom_bias_grad, pd_bias_grad, "bias_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_optional.py b/test/custom_op/test_custom_optional.py index 53d4f15952740..1c1335b37bd98 100644 --- a/test/custom_op/test_custom_optional.py +++ 
b/test/custom_op/test_custom_optional.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle from paddle import static @@ -465,44 +465,6 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] - def check_output(self, out, pd_out, name): - if out is None and pd_out is None: - return - assert out is not None, "out value of " + name + " is None" - assert pd_out is not None, "pd_out value of " + name + " is None" - if isinstance(out, list) and isinstance(pd_out, list): - for idx in range(len(out)): - np.testing.assert_array_equal( - out[idx], - pd_out[idx], - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out[idx], name, pd_out[idx] - ), - ) - else: - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - - def check_output_allclose(self, out, pd_out, name): - if out is None and pd_out is None: - return - assert out is not None, "out value of " + name + " is None" - assert pd_out is not None, "pd_out value of " + name + " is None" - np.testing.assert_allclose( - out, - pd_out, - rtol=5e-5, - atol=1e-2, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_optional_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -526,9 +488,9 @@ def test_optional_static_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_dynamic_add(self): for device in self.devices: @@ -553,9 +515,9 @@ def test_optional_dynamic_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_inplace_static_add(self): for device in self.devices: @@ -576,13 +538,11 @@ def test_optional_inplace_static_add(self): np_y, ) - self.check_output(custom_tuple[0], pd_tuple[0], "x") - self.check_output(custom_tuple[1], pd_tuple[1], "out") - self.check_output(custom_tuple[2], pd_tuple[2], "x_grad") + check_output(custom_tuple[0], pd_tuple[0], "x") + check_output(custom_tuple[1], pd_tuple[1], "out") + check_output(custom_tuple[2], pd_tuple[2], "x_grad") if len(custom_tuple) > 3: - self.check_output( - custom_tuple[3], pd_tuple[3], "y_grad" - ) + check_output(custom_tuple[3], pd_tuple[3], "y_grad") def test_optional_inplace_dynamic_add(self): for device in self.devices: @@ -619,16 +579,16 @@ def test_optional_inplace_dynamic_add(self): np_y, ) - self.check_output(pd_y, pd_outy, "inplace_pd_y") - self.check_output(custom_y, custom_outy, "inplace_custom_y") + check_output(pd_y, pd_outy, "inplace_pd_y") + check_output(custom_y, custom_outy, "inplace_custom_y") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_outx, pd_outx, "outx") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_outy, pd_outy, "outy") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + 
check_output(custom_x, pd_x, "x") + check_output(custom_outx, pd_outx, "outx") + check_output(custom_y, pd_y, "y") + check_output(custom_outy, pd_outy, "outy") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_optional_vector_static_add(self): for device in self.devices: @@ -653,9 +613,9 @@ def test_optional_vector_static_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_vector_dynamic_add(self): for device in self.devices: @@ -680,9 +640,9 @@ def test_optional_vector_dynamic_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_inplace_vector_static_add(self): for device in self.devices: @@ -703,16 +663,12 @@ def test_optional_inplace_vector_static_add(self): np_y, ) - self.check_output(custom_tuple[0], pd_tuple[0], "x") - self.check_output(custom_tuple[1], pd_tuple[1], "out") - self.check_output(custom_tuple[2], pd_tuple[2], "x_grad") + check_output(custom_tuple[0], pd_tuple[0], "x") + check_output(custom_tuple[1], pd_tuple[1], "out") + check_output(custom_tuple[2], pd_tuple[2], "x_grad") if len(custom_tuple) > 3: - self.check_output( - custom_tuple[3], pd_tuple[3], "y1_grad" - ) - self.check_output( - custom_tuple[4], pd_tuple[4], "y2_grad" - ) + check_output(custom_tuple[3], pd_tuple[3], "y1_grad") + check_output(custom_tuple[4], pd_tuple[4], "y2_grad") def test_optional_inplace_vector_dynamic_add(self): for device in self.devices: @@ -749,16 +705,16 @@ def test_optional_inplace_vector_dynamic_add(self): np_y, ) - self.check_output(pd_y, pd_outy, "inplace_pd_y") - self.check_output(custom_y, custom_outy, "inplace_custom_y") + check_output(pd_y, pd_outy, "inplace_pd_y") + check_output(custom_y, custom_outy, "inplace_custom_y") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_outx, pd_outx, "outx") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_outy, pd_outy, "outy") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_outx, pd_outx, "outx") + check_output(custom_y, pd_y, "y") + check_output(custom_outy, pd_outy, "outy") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_relu_op_setup.py b/test/custom_op/test_custom_relu_op_setup.py index 8a164b0472933..8673a806313fe 100644 --- a/test/custom_op/test_custom_relu_op_setup.py +++ b/test/custom_op/test_custom_relu_op_setup.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from utils import check_output, check_output_allclose import paddle from paddle import static @@ -205,13 +206,7 @@ def test_static(self): pd_out = custom_relu_static( custom_op, device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: 
{}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") def test_dynamic(self): for device in self.devices: @@ -226,20 +221,8 @@ def test_dynamic(self): pd_out, pd_x_grad = custom_relu_dynamic( custom_op, device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x_grad") def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -263,13 +246,7 @@ def test_static_save_and_load_inference_model(self): feed={feed_target_names[0]: np_data}, fetch_list=fetch_targets, ) - np.testing.assert_array_equal( - predict, - predict_infer, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) + check_output(predict, predict_infer, "predict") paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -298,12 +275,9 @@ def test_static_save_and_run_inference_predictor(self): predictor.get_output_names()[0] ) predict_infer = output_tensor.copy_to_cpu() - self.assertTrue( - np.isclose(predict, predict_infer, rtol=5e-5).any(), - "custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer - ), - ) + predict = np.array(predict).flatten() + predict_infer = np.array(predict_infer).flatten() + check_output_allclose(predict, predict_infer, "predict") paddle.disable_static() def test_double_grad_dynamic(self): @@ -318,20 +292,8 @@ def test_double_grad_dynamic(self): pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( self.custom_ops[0], device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) + check_output(out, pd_out, "out") + check_output(dx_grad, pd_dx_grad, "dx_grad") def test_with_dataloader(self): for device in self.devices: @@ -355,13 +317,7 @@ def test_with_dataloader(self): image = paddle.to_tensor(image) out = self.custom_ops[0](image) pd_out = paddle.nn.functional.relu(image) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") if batch_id == 5: break diff --git a/test/custom_op/test_custom_relu_op_xpu_setup.py b/test/custom_op/test_custom_relu_op_xpu_setup.py index 3eed65668ebc8..e054eadafd03a 100644 --- a/test/custom_op/test_custom_relu_op_xpu_setup.py +++ b/test/custom_op/test_custom_relu_op_xpu_setup.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from utils import check_output, check_output_allclose import paddle from paddle import static @@ -183,13 +184,7 @@ def test_static(self): pd_out = custom_relu_static( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") def test_dynamic(self): for dtype in self.dtypes: @@ -200,20 +195,8 @@ def test_dynamic(self): pd_out, pd_x_grad = custom_relu_dynamic( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op 
out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x_grad") def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -237,14 +220,7 @@ def test_static_save_and_load_inference_model(self): feed={feed_target_names[0]: np_data}, fetch_list=fetch_targets, ) - np.testing.assert_allclose( - predict, - predict_infer, - atol=1e-2, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) + check_output(predict, predict_infer, "predict") paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -272,15 +248,7 @@ def test_static_save_and_run_inference_predictor(self): predict_infer = output_tensor.copy_to_cpu() predict = np.array(predict).flatten() predict_infer = np.array(predict_infer).flatten() - np.testing.assert_allclose( - predict, - predict_infer, - rtol=5e-5, - atol=1e-2, - err_msg="custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer - ), - ) + check_output_allclose(predict, predict_infer, "predict") paddle.disable_static() def test_func_double_grad_dynamic(self): @@ -292,20 +260,8 @@ def test_func_double_grad_dynamic(self): pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) + check_output(out, pd_out, "out") + check_output(dx_grad, pd_dx_grad, "dx_grad") def test_with_dataloader(self): paddle.disable_static() @@ -328,14 +284,7 @@ def test_with_dataloader(self): for batch_id, (image, _) in enumerate(train_loader()): out = self.custom_op(image) pd_out = paddle.nn.functional.relu(image) - np.testing.assert_allclose( - out, - pd_out, - atol=1e-2, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output_allclose(out, pd_out, "out", atol=1e-2) if batch_id == 5: break diff --git a/test/custom_op/test_custom_simple_slice.py b/test/custom_op/test_custom_simple_slice.py index d69322103520c..e2662e70f3bc6 100644 --- a/test/custom_op/test_custom_simple_slice.py +++ b/test/custom_op/test_custom_simple_slice.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle from paddle.utils.cpp_extension import get_build_directory, load @@ -47,13 +47,7 @@ def test_slice_output(self): x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) np_out = np_x[2:3] - np.testing.assert_array_equal( - custom_op_out, - np_out, - err_msg='custom op: {},\n numpy: {}'.format( - np_out, custom_op_out.numpy() - ), - ) + check_output(custom_op_out, np_out, "out") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_tanh_double_grad.py b/test/custom_op/test_custom_tanh_double_grad.py index 08c57dac91fe1..a47ce712dd6a4 100644 --- a/test/custom_op/test_custom_tanh_double_grad.py +++ b/test/custom_op/test_custom_tanh_double_grad.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from utils import 
extra_cc_args, extra_nvcc_args, paddle_includes +from utils import ( + check_output_allclose, + extra_cc_args, + extra_nvcc_args, + paddle_includes, +) import paddle from paddle.utils.cpp_extension import get_build_directory, load @@ -77,30 +82,9 @@ def test_double_grad_dynamic(self): pd_out, pd_dx_grad, pd_dout = custom_tanh_double_grad_dynamic( paddle.tanh, device, dtype, x ) - np.testing.assert_allclose( - out, - pd_out, - rtol=1e-05, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_allclose( - dx_grad, - pd_dx_grad, - rtol=1e-05, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) - np.testing.assert_allclose( - dout, - pd_dout, - rtol=1e-05, - err_msg='custom op out grad: {},\n paddle api out grad: {}'.format( - dout, pd_dout - ), - ) + check_output_allclose(out, pd_out, "out", rtol=1e-05) + check_output_allclose(dx_grad, pd_dx_grad, "dx_grad", rtol=1e-05) + check_output_allclose(dout, pd_dout, "dout", rtol=1e-05) if __name__ == "__main__": diff --git a/test/custom_op/test_custom_tensor_operator.py b/test/custom_op/test_custom_tensor_operator.py index 4e524b2f5b16b..f6edbd934171d 100644 --- a/test/custom_op/test_custom_tensor_operator.py +++ b/test/custom_op/test_custom_tensor_operator.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from utils import extra_cc_args, paddle_includes +from utils import ( + check_output, + check_output_allclose, + extra_cc_args, + paddle_includes, +) import paddle from paddle import static @@ -260,7 +265,7 @@ def _test_static(self): pd_out = test_custom_add_static( self.add, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_subtract_static( self.subtract, device, dtype, x ) pd_out = test_custom_subtract_static( self.subtract, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_multiply_static( self.multiply, device, dtype, x ) pd_out = test_custom_multiply_static( self.multiply, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_divide_static(self.divide, device, dtype, x) pd_out = test_custom_divide_static( self.divide, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) def _test_dynamic(self): for device in self.devices: @@ -297,9 +302,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_add_dynamic( self.add, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) + check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = test_custom_subtract_dynamic( @@ -308,9 +313,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_subtract_dynamic( self.subtract, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) +
check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = test_custom_multiply_dynamic( @@ -319,9 +324,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_multiply_dynamic( self.multiply, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) + check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = test_custom_divide_dynamic( @@ -330,7 +335,7 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_divide_dynamic( self.divide, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) def _test_logical_operants(self): for device in self.devices: @@ -342,19 +347,19 @@ def _test_logical_operants(self): out = self.custom_module.custom_logical_and(x, y) pd_out = paddle.bitwise_and(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_or(x, y) pd_out = paddle.bitwise_or(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_xor(x, y) pd_out = paddle.bitwise_xor(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_not(x) pd_out = paddle.bitwise_not(x) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") def _test_compare_operants(self): for device in self.devices: @@ -366,27 +371,27 @@ def _test_compare_operants(self): out = self.custom_module.custom_less_than(x, y) pd_out = paddle.less_than(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_less_equal(x, y) pd_out = paddle.less_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_equal(x, y) pd_out = paddle.equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_not_equal(x, y) pd_out = paddle.not_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_greater_than(x, y) pd_out = paddle.greater_than(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_greater_equal(x, y) pd_out = paddle.greater_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") if __name__ == '__main__': diff --git a/test/custom_op/test_multi_out_jit.py b/test/custom_op/test_multi_out_jit.py index f3e3a6ec8abc1..a191ab33e6a7c 100644 --- a/test/custom_op/test_multi_out_jit.py +++ b/test/custom_op/test_multi_out_jit.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, paddle_includes +from utils import check_output, extra_cc_args, paddle_includes import paddle from paddle import static @@ -105,15 +105,6 @@ def setUp(self): self.np_y = np.random.uniform(-1, 1, [4, 8]).astype("float32") self.np_z = np.random.uniform(-1, 1, [4, 
8]).astype("float32") - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def run_static(self, device, dtype): paddle.set_device(device) x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -140,14 +131,12 @@ def check_multi_outputs(self, outs, is_dynamic=False): one_int32 = one_int32.numpy() # Fake_float64 self.assertTrue('float64' in str(zero_float64.dtype)) - np.testing.assert_array_equal( - zero_float64, np.zeros([4, 8]).astype('float64') + check_output( + zero_float64, np.zeros([4, 8]).astype('float64'), "zero_float64" ) # ZFake_int32 self.assertTrue('int32' in str(one_int32.dtype)) - np.testing.assert_array_equal( - one_int32, np.ones([4, 8]).astype('int32') - ) + check_output(one_int32, np.ones([4, 8]).astype('int32'), "one_int32") def test_multi_out_static(self): paddle.enable_static() @@ -193,10 +182,10 @@ def test_discrete_out_static(self): self.np_y, self.np_z, ) - self.check_output(custom_out, pd_out, "out") + check_output(custom_out, pd_out, "out") # NOTE: In static mode, the output gradient of custom operator has been optimized to shape=[1]. However, native paddle op's output shape = [4, 8], hence we need to fetch pd_w_grad[0][0] (By the way, something wrong with native paddle's gradient, the outputs with other indexes instead of pd_w_grad[0][0] is undefined in this unittest.) - self.check_output(custom_w_grad, pd_w_grad[0][0], "w_grad") - self.check_output(custom_y_grad, pd_y_grad[0][0], "y_grad") + check_output(custom_w_grad, pd_w_grad[0][0], "w_grad") + check_output(custom_y_grad, pd_y_grad[0][0], "y_grad") def test_discrete_out_dynamic(self): for device in self.devices: @@ -223,9 +212,9 @@ def test_discrete_out_dynamic(self): self.np_y, self.np_z, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_w_grad, pd_w_grad, "w_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_w_grad, pd_w_grad, "w_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") if __name__ == '__main__': diff --git a/test/custom_op/utils.py b/test/custom_op/utils.py index 7e199f3a6114d..d65a0f2175f6e 100644 --- a/test/custom_op/utils.py +++ b/test/custom_op/utils.py @@ -16,6 +16,8 @@ import sys from site import getsitepackages +import numpy as np + from paddle.utils.cpp_extension.extension_utils import IS_WINDOWS IS_MAC = sys.platform.startswith('darwin') @@ -39,3 +41,43 @@ extra_cc_args = ['-w', '-g'] if not IS_WINDOWS else ['/w'] extra_nvcc_args = ['-O3'] extra_compile_args = {'cc': extra_cc_args, 'nvcc': extra_nvcc_args} + + +def check_output(out, pd_out, name): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not None, "pd_out value of " + name + " is None" + if isinstance(out, list) and isinstance(pd_out, list): + for idx in range(len(out)): + np.testing.assert_array_equal( + out[idx], + pd_out[idx], + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out[idx], name, pd_out[idx] + ), + ) + else: + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) + + +def check_output_allclose(out, pd_out, name, rtol=5e-5, atol=1e-2): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not None, "pd_out value of " + name 
+ " is None" + np.testing.assert_allclose( + out, + pd_out, + rtol, + atol, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) diff --git a/test/dygraph_to_static/test_cinn_prim.py b/test/dygraph_to_static/test_cinn_prim.py index 6ace7696c383a..388cb67c66f43 100644 --- a/test/dygraph_to_static/test_cinn_prim.py +++ b/test/dygraph_to_static/test_cinn_prim.py @@ -163,5 +163,20 @@ def test_cinn_prim(self): ) +class TestBackend(unittest.TestCase): + def test_backend(self): + x = paddle.randn([2, 4]) + out1 = self.forward(x, 'CINN') + out2 = self.forward(x, None) + np.testing.assert_allclose(out1, out2, rtol=1e-6) + + def forward(self, x, backend=None): + paddle.seed(2022) + net = PrimeNet() + net = paddle.jit.to_static(net, backend=backend) + out = net(x) + return out + + if __name__ == '__main__': unittest.main() diff --git a/test/dygraph_to_static/test_partial_program_hook.py b/test/dygraph_to_static/test_partial_program_hook.py index 896dde419bf20..b9a64d3d0993a 100644 --- a/test/dygraph_to_static/test_partial_program_hook.py +++ b/test/dygraph_to_static/test_partial_program_hook.py @@ -44,7 +44,7 @@ def f(): f ).get_concrete_program() self._hook = program_translator.PrimHooker( - concrete_program.main_program + concrete_program.main_program, None ) self._forward = partial_program.forward_program self._whole = partial_program._train_program diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/test/fft/CMakeLists.txt similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/CMakeLists.txt rename to test/fft/CMakeLists.txt diff --git a/python/paddle/fluid/tests/unittests/fft/__init__.py b/test/fft/__init__.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/__init__.py rename to test/fft/__init__.py diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/test/fft/spectral_op_np.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/spectral_op_np.py rename to test/fft/spectral_op_np.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/test/fft/test_fft.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/test_fft.py rename to test/fft/test_fft.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/test/fft/test_fft_with_static_graph.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py rename to test/fft/test_fft_with_static_graph.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/test/fft/test_spectral_op.py similarity index 99% rename from python/paddle/fluid/tests/unittests/fft/test_spectral_op.py rename to test/fft/test_spectral_op.py index 6b8ab6cc2ff04..075d68b68ed47 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py +++ b/test/fft/test_spectral_op.py @@ -29,7 +29,7 @@ from paddle import _C_ops sys.path.append("../") -from eager_op_test import OpTest +from paddle.fluid.tests.unittests.eager_op_test import OpTest paddle.enable_static() diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py index deda6671d52a8..46ea9bfba72a7 100644 --- a/test/prim/model/test_resnet_prim_cinn.py +++ b/test/prim/model/test_resnet_prim_cinn.py @@ -70,15 +70,15 @@ # The results in ci as as follows: DY2ST_PRIM_GT = [ 5.82879114151001, - 8.333706855773926, - 5.07769250869751, - 8.66937255859375, - 8.411705017089844, - 7.252340793609619, - 9.683248519897461, - 
8.177335739135742, - 8.195427894592285, - 10.219732284545898, + 8.33370590209961, + 5.091761589050293, + 8.776082992553711, + 8.274380683898926, + 7.546653747558594, + 9.607137680053711, + 8.27371597290039, + 8.429732322692871, + 10.362630844116211, ] DY2ST_CINN_GT = [ 5.828789710998535, @@ -92,17 +92,18 @@ 8.383116722106934, 10.120304107666016, ] + DY2ST_PRIM_CINN_GT = [ - 5.828784942626953, - 8.341737747192383, - 5.113619327545166, - 8.625601768493652, - 8.082450866699219, - 7.4913249015808105, - 9.858025550842285, - 8.287693977355957, - 8.435812950134277, - 10.372406005859375, + 5.828786849975586, + 8.332868576049805, + 5.038548469543457, + 8.554015159606934, + 8.106254577636719, + 7.493070125579834, + 9.479158401489258, + 8.270158767700195, + 8.324719429016113, + 10.140411376953125, ] if core.is_compiled_with_cuda(): @@ -130,31 +131,13 @@ def optimizer_setting(parameter_list=None): return optimizer -def train(to_static, enable_prim, enable_cinn): - if core.is_compiled_with_cuda(): - paddle.set_device('gpu') - else: - paddle.set_device('cpu') - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - fluid.core._set_prim_all_enabled(enable_prim) - - train_reader = paddle.batch( - reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True, - ) - data_loader = fluid.io.DataLoader.from_generator(capacity=5, iterable=True) - data_loader.set_sample_list_generator(train_reader) - - resnet = resnet50(False) - if to_static: - build_strategy = paddle.static.BuildStrategy() - if enable_cinn: - build_strategy.build_cinn_pass = True - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) - optimizer = optimizer_setting(parameter_list=resnet.parameters()) +def run(model, data_loader, optimizer, mode): + if mode == 'train': + model.train() + end_step = 9 + elif mode == 'eval': + model.eval() + end_step = 1 for epoch in range(epoch_num): total_acc1 = 0.0 @@ -166,7 +149,7 @@ def train(to_static, enable_prim, enable_cinn): start_time = time.time() img, label = data - pred = resnet(img) + pred = model(img) avg_loss = paddle.nn.functional.cross_entropy( input=pred, label=label, @@ -178,9 +161,10 @@ def train(to_static, enable_prim, enable_cinn): acc_top1 = paddle.static.accuracy(input=pred, label=label, k=1) acc_top5 = paddle.static.accuracy(input=pred, label=label, k=5) - avg_loss.backward() - optimizer.minimize(avg_loss) - resnet.clear_gradients() + if mode == 'train': + avg_loss.backward() + optimizer.minimize(avg_loss) + model.clear_gradients() total_acc1 += acc_top1 total_acc5 += acc_top5 @@ -189,8 +173,9 @@ def train(to_static, enable_prim, enable_cinn): end_time = time.time() print( - "epoch %d | batch step %d, loss %0.8f, acc1 %0.3f, acc5 %0.3f, time %f" + "[%s]epoch %d | batch step %d, loss %0.8f, acc1 %0.3f, acc5 %0.3f, time %f" % ( + mode, epoch, batch_id, avg_loss, @@ -199,7 +184,7 @@ def train(to_static, enable_prim, enable_cinn): end_time - start_time, ) ) - if batch_id >= 9: + if batch_id >= end_step: # avoid dataloader throw abort signaal data_loader._reset() break @@ -207,6 +192,38 @@ def train(to_static, enable_prim, enable_cinn): return losses +def train(to_static, enable_prim, enable_cinn): + if core.is_compiled_with_cuda(): + paddle.set_device('gpu') + else: + paddle.set_device('cpu') + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + fluid.core._set_prim_all_enabled(enable_prim) + + train_reader = paddle.batch( + 
reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True, + ) + data_loader = fluid.io.DataLoader.from_generator(capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = resnet50(False) + if to_static: + build_strategy = paddle.static.BuildStrategy() + if enable_cinn: + build_strategy.build_cinn_pass = True + resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + train_losses = run(resnet, data_loader, optimizer, 'train') + if to_static and enable_prim and enable_cinn: + eval_losses = run(resnet, data_loader, optimizer, 'eval') + return train_losses + + class TestResnet(unittest.TestCase): @unittest.skipIf( not (paddle.is_compiled_with_cinn() and paddle.is_compiled_with_cuda()), diff --git a/test/tokenizer/CMakeLists.txt b/test/tokenizer/CMakeLists.txt new file mode 100644 index 0000000000000..1cf384df660b3 --- /dev/null +++ b/test/tokenizer/CMakeLists.txt @@ -0,0 +1,12 @@ +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() + +set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/test/tokenizer/__init__.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/__init__.py rename to test/tokenizer/__init__.py diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/test/tokenizer/bert_tokenizer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py rename to test/tokenizer/bert_tokenizer.py diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/test/tokenizer/test_faster_tokenizer_op.py similarity index 99% rename from python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py rename to test/tokenizer/test_faster_tokenizer_op.py index 6972505bf3cbb..37bb09a514a18 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/test/tokenizer/test_faster_tokenizer_op.py @@ -13,20 +13,17 @@ # limitations under the License. 
import os -import sys import tempfile import unittest import numpy as np +from bert_tokenizer import BertTokenizer import paddle from paddle import _legacy_C_ops, nn from paddle.fluid.framework import _non_static_mode, core from paddle.fluid.layer_helper import LayerHelper -sys.path.append("./tokenizer") -from tokenizer.bert_tokenizer import BertTokenizer - def to_string_tensor(string_values, name): """ diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/test/tokenizer/tokenizer_utils.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py rename to test/tokenizer/tokenizer_utils.py diff --git a/test/xpu/CMakeLists.txt b/test/xpu/CMakeLists.txt index e0543ef9e50f5..4ecde12f008af 100644 --- a/test/xpu/CMakeLists.txt +++ b/test/xpu/CMakeLists.txt @@ -1,3 +1,40 @@ if(WITH_XPU) add_subdirectory(cpp) endif() + +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +if(WITH_XPU_BKCL) + list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op") +endif() + +file( + GLOB DIST_TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_dist_*.py") +if(WITH_XPU_BKCL) + list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op) +endif() + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach() + +foreach(TEST_OP ${DIST_TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach() + +set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900) +set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300) +set_tests_properties(test_collective_identity_xpu + PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") +set_tests_properties(test_collective_allgather_xpu + PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") +set_tests_properties(test_collective_allreduce_xpu + PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_allgather_op_xpu.py b/test/xpu/collective_allgather_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_allgather_op_xpu.py rename to test/xpu/collective_allgather_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_allreduce_op_xpu.py b/test/xpu/collective_allreduce_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_allreduce_op_xpu.py rename to test/xpu/collective_allreduce_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_broadcast_op_xpu.py b/test/xpu/collective_broadcast_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_broadcast_op_xpu.py rename to test/xpu/collective_broadcast_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_concat_op.py b/test/xpu/collective_concat_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_concat_op.py rename to test/xpu/collective_concat_op.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_identity_op_xpu.py b/test/xpu/collective_identity_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_identity_op_xpu.py rename to test/xpu/collective_identity_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_softmax_with_cross_entropy_op_xpu.py b/test/xpu/collective_softmax_with_cross_entropy_op_xpu.py 
similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_softmax_with_cross_entropy_op_xpu.py rename to test/xpu/collective_softmax_with_cross_entropy_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_split_op.py b/test/xpu/collective_split_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/collective_split_op.py rename to test/xpu/collective_split_op.py diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/test/xpu/get_test_cover_info.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py rename to test/xpu/get_test_cover_info.py diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/test/xpu/op_test_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/op_test_xpu.py rename to test/xpu/op_test_xpu.py index af92704a57216..02e68b3c3ed93 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/test/xpu/op_test_xpu.py @@ -12,15 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import numpy as np + +sys.path.append('..') +sys.path.append('../../python/paddle/fluid/tests/unittests') + from eager_op_test import OpTest -from testsuite import append_loss_ops, create_op, set_input -from white_list import no_grad_set_white_list, op_threshold_white_list -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( get_xpu_op_support_types, is_empty_grad_op_type, type_dict_str_to_numpy, ) +from testsuite import append_loss_ops, create_op, set_input +from white_list import no_grad_set_white_list, op_threshold_white_list import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_dataparallel_with_pylayer.py rename to test/xpu/parallel_dygraph_dataparallel_with_pylayer.py diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check.py rename to test/xpu/parallel_dygraph_gradient_check.py diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py rename to test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/test/xpu/process_group_bkcl.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py rename to test/xpu/process_group_bkcl.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/test/xpu/test_accuracy_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py rename to test/xpu/test_accuracy_op_xpu.py index 082e883ded741..a87f6c084351c 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ b/test/xpu/test_accuracy_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/test/xpu/test_activation_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py rename to test/xpu/test_activation_op_xpu.py index b071db95b40bc..a877b09bbc957 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/test/xpu/test_activation_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py b/test/xpu/test_adadelta_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py rename to test/xpu/test_adadelta_op_xpu.py index 71b691a6f2743..b6ef0fbdf8ec8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py +++ b/test/xpu/test_adadelta_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid @@ -52,11 +48,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } self.attrs = {'rho': rho, 'epsilon': epsilon} @@ -107,11 +105,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } avg_squared_grad_out = rho * avg_squared_grad + ( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py b/test/xpu/test_adagrad_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py rename to test/xpu/test_adagrad_op_xpu.py index 942ffd26a4c09..34040ebd3f77b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py +++ b/test/xpu/test_adagrad_op_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -import numpy as np - -import paddle - -sys.path.append("..") import unittest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +import numpy as np +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/test/xpu/test_adam_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py rename to test/xpu/test_adam_op_xpu.py index 6d44d355e4cdc..990136c57170e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py +++ b/test/xpu/test_adam_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py rename to test/xpu/test_adamw_op_xpu.py index d30fdbed09db8..768cbe8151da3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py +++ b/test/xpu/test_adamw_op_xpu.py @@ -12,20 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") - import unittest from functools import partial import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py b/test/xpu/test_affine_channel_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py rename to test/xpu/test_affine_channel_op_xpu.py index 6f85dc47488ab..c200235ff879c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py +++ b/test/xpu/test_affine_channel_op_xpu.py @@ -15,10 +15,6 @@ Unit testing for affine_channel_op """ -import sys - -sys.path.append("..") - import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py b/test/xpu/test_amp_check_finite_and_scale_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py rename to test/xpu/test_amp_check_finite_and_scale_op_xpu.py index e171625dd4367..6abcf53707a33 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py +++ b/test/xpu/test_amp_check_finite_and_scale_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py b/test/xpu/test_arg_max_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py rename to test/xpu/test_arg_max_op_xpu.py index d9a69216351a4..4a8e0dc28fad1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py +++ b/test/xpu/test_arg_max_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/test/xpu/test_argsort_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py rename to test/xpu/test_argsort_op_xpu.py index 39f554f9ac176..f3a8a69ee5ded 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py +++ b/test/xpu/test_argsort_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/test/xpu/test_assign_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py rename to test/xpu/test_assign_op_xpu.py index 97460b54aa310..d3102dd448a49 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/test/xpu/test_assign_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/test/xpu/test_assign_value_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py rename to test/xpu/test_assign_value_op_xpu.py index d98e6375da52d..a0e3a57dc8ac5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py +++ b/test/xpu/test_assign_value_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py b/test/xpu/test_atan_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py
rename to test/xpu/test_atan_op_xpu.py
index bb02e1320da15..4ab5b14e9b44e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py
+++ b/test/xpu/test_atan_op_xpu.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
@@ -21,14 +20,13 @@

 paddle.enable_static()

-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest


 class XPUTestAtanOp(XPUOpTestWrapper):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/test/xpu/test_batch_norm_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
rename to test/xpu/test_batch_norm_op_xpu.py
index 446d49717af81..6cf666c8094c9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
+++ b/test/xpu/test_batch_norm_op_xpu.py
@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py b/test/xpu/test_bce_loss_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py
rename to test/xpu/test_bce_loss_op_xpu.py
index 883063969ff6a..acc3bd06e6103 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py
+++ b/test/xpu/test_bce_loss_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py b/test/xpu/test_bilinear_interp_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py
rename to test/xpu/test_bilinear_interp_op_xpu.py
index dc8e996e09382..a5a849f080e6a 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py
+++ b/test/xpu/test_bilinear_interp_op_xpu.py
@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import paddle

-sys.path.append("..")
-
 paddle.enable_static()
 '''
 def bilinear_interp_np(input,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py b/test/xpu/test_bilinear_interp_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py
rename to test/xpu/test_bilinear_interp_v2_op_xpu.py
index ebd48f55d57f1..dd0a6049221fd 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py
+++ b/test/xpu/test_bilinear_interp_v2_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py b/test/xpu/test_bitwise_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py
rename to test/xpu/test_bitwise_op_xpu.py
index 8fcf5a7af7811..1d21108bf8cd5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py
+++ b/test/xpu/test_bitwise_op_xpu.py
@@ -15,17 +15,16 @@
 import sys
 import unittest

-import numpy as np
-
-sys.path.append("..")
+sys.path.append('../../python/paddle/fluid/tests/unittests')

+import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py b/test/xpu/test_bmm_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py
rename to test/xpu/test_bmm_op_xpu.py
index d0d43dd94b0aa..48bd4ea692cf8 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py
+++ b/test/xpu/test_bmm_op_xpu.py
@@ -10,19 +10,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_concat.py b/test/xpu/test_c_concat.py
similarity index 95%
rename from python/paddle/fluid/tests/unittests/xpu/test_c_concat.py
rename to test/xpu/test_c_concat.py
index 313ae27a5b617..d2490aa3772dc 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_c_concat.py
+++ b/test/xpu/test_c_concat.py
@@ -12,21 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py b/test/xpu/test_c_embedding_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py rename to test/xpu/test_c_embedding_op_xpu.py index b685458a3eed6..4d0989c322e54 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py +++ b/test/xpu/test_c_embedding_op_xpu.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") import paddle from paddle.fluid.tests.unittests.c_embedding_op_base import ( TestCEmbeddingCPU, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_split.py b/test/xpu/test_c_split.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_c_split.py rename to test/xpu/test_c_split.py index c5b0f236935af..67e2f1a6cc5f6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_c_split.py +++ b/test/xpu/test_c_split.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/test/xpu/test_cast_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py rename to test/xpu/test_cast_op_xpu.py index e013432d13b97..baf814e08de8a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py +++ b/test/xpu/test_cast_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py b/test/xpu/test_clip_by_norm_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py rename to test/xpu/test_clip_by_norm_op_xpu.py index 206f65c10afcd..4bec31b80d85c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py +++ b/test/xpu/test_clip_by_norm_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/test/xpu/test_clip_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py rename to test/xpu/test_clip_op_xpu.py index 994153a8dd725..79d4e3e779869 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/test/xpu/test_clip_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py b/test/xpu/test_coalesce_tensor_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py rename to test/xpu/test_coalesce_tensor_op_xpu.py index 2324d09857dcf..f0f053137949f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py +++ b/test/xpu/test_coalesce_tensor_op_xpu.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np from paddle.fluid import core -sys.path.append("..") - alignment = 256 -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py b/test/xpu/test_collective_allgather_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py rename to test/xpu/test_collective_allgather_xpu.py index be1326d176456..3651ed2062957 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py +++ b/test/xpu/test_collective_allgather_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py b/test/xpu/test_collective_allreduce_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py rename to test/xpu/test_collective_allreduce_xpu.py index 187494f50154e..05539aeaae432 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py +++ b/test/xpu/test_collective_allreduce_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_base_xpu.py rename to test/xpu/test_collective_base_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py b/test/xpu/test_collective_broadcast_xpu.py similarity index 92% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py rename to test/xpu/test_collective_broadcast_xpu.py index e015d0f92b114..5ddb451e7e4fa 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py +++ b/test/xpu/test_collective_broadcast_xpu.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest +from get_test_cover_info import XPUOpTestWrapper, create_test_class from test_collective_base_xpu import TestDistBase import paddle from paddle.fluid import core -sys.path.append("..") - -from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class - paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py b/test/xpu/test_collective_identity_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py rename to test/xpu/test_collective_identity_xpu.py index 3b5a2fa767a97..421f9168a28d3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py +++ b/test/xpu/test_collective_identity_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_process_group.py b/test/xpu/test_collective_process_group.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_process_group.py rename to test/xpu/test_collective_process_group.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py b/test/xpu/test_collective_softmax_with_cross_entropy_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py rename to test/xpu/test_collective_softmax_with_cross_entropy_xpu.py index 703194eb58d5e..0bc75c7a4930b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py +++ b/test/xpu/test_collective_softmax_with_cross_entropy_xpu.py @@ -13,22 +13,18 @@ # limitations under the License. import os -import sys import unittest import numpy as np -from test_collective_base_xpu import DataTypeCast, TestDistBase - -import paddle -from paddle.framework import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import DataTypeCast, TestDistBase + +import paddle +from paddle.framework import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/test/xpu/test_compare_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py rename to test/xpu/test_compare_op_xpu.py index e16b9032f2ea4..4793122a81753 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/test/xpu/test_compare_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py b/test/xpu/test_concat_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py rename to test/xpu/test_concat_op_xpu.py index 5867858a97b4d..4f722ef6d9853 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py +++ b/test/xpu/test_concat_op_xpu.py @@ -13,18 +13,18 @@ # limitations under the License. 
import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py rename to test/xpu/test_conv2d_op_xpu.py index a3eb2a1f3a77e..d09402f934c69 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py +++ b/test/xpu/test_conv2d_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py rename to test/xpu/test_conv2d_transpose_op_xpu.py index a5be198089e86..7bf01d23fb56f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_conv2d_transpose_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py rename to test/xpu/test_conv3d_op_xpu.py index f6578371b97ad..f9904148f9b38 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py +++ b/test/xpu/test_conv3d_op_xpu.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np +from get_test_cover_info import XPUOpTestWrapper, create_test_class from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py b/test/xpu/test_cumprod_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py rename to test/xpu/test_cumprod_op_xpu.py index 3ea12d2bf9f41..fb3763ac5e8f7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py +++ b/test/xpu/test_cumprod_op_xpu.py @@ -13,19 +13,15 @@ # limitations under the License. 
import random -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py b/test/xpu/test_cumsum_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py rename to test/xpu/test_cumsum_op_xpu.py index 8ba052171fc2a..2e3555b702576 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py +++ b/test/xpu/test_cumsum_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py b/test/xpu/test_deformable_conv_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py rename to test/xpu/test_deformable_conv_op_xpu.py index 84afb9cbd03ea..8577cb2497704 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py +++ b/test/xpu/test_deformable_conv_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import OpTest, XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import OpTest, XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py b/test/xpu/test_depthwise_conv2d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py rename to test/xpu/test_depthwise_conv2d_op_xpu.py index 7ccf79170ddf2..a0b01c921280f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py +++ b/test/xpu/test_depthwise_conv2d_op_xpu.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np @@ -22,12 +19,12 @@ import paddle paddle.enable_static() -from test_conv2d_op_xpu import XPUTestConv2DOp, XPUTestConv2DOp_v2 -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_conv2d_op_xpu import XPUTestConv2DOp, XPUTestConv2DOp_v2 class XPUTestDepthwiseConv2DOp(XPUOpTestWrapper): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py b/test/xpu/test_device_guard_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py rename to test/xpu/test_device_guard_xpu.py index 01581c9ac61cc..cc9fb142279ac 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py +++ b/test/xpu/test_device_guard_xpu.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest - -sys.path.append("..") - import warnings import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py b/test/xpu/test_diag_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py rename to test/xpu/test_diag_v2_op_xpu.py index 0a2eac8720ef1..51f42d00507fe 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py +++ b/test/xpu/test_diag_v2_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") - import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py b/test/xpu/test_diagonal_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py rename to test/xpu/test_diagonal_op_xpu.py index 001cd727a081c..bbf289ce4c9fd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py +++ b/test/xpu/test_diagonal_op_xpu.py @@ -15,18 +15,18 @@ import sys import unittest -import numpy as np - -import paddle +sys.path.append('../../python/paddle/fluid/tests/unittests') -sys.path.append("..") +import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py b/test/xpu/test_distribute_fpn_proposals_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py rename to test/xpu/test_distribute_fpn_proposals_op_xpu.py index c11899d4c7ed7..230b9647f6ef1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py +++ b/test/xpu/test_distribute_fpn_proposals_op_xpu.py @@ -11,9 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np @@ -23,7 +20,7 @@ paddle.enable_static() -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/test/xpu/test_dropout_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py rename to test/xpu/test_dropout_op_xpu.py index 562b968b5f698..1a3459736c268 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py +++ b/test/xpu/test_dropout_op_xpu.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np @@ -26,7 +23,7 @@ paddle.enable_static() -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py b/test/xpu/test_einsum_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py rename to test/xpu/test_einsum_op_xpu.py index cb73f85671a83..57a82009834fa 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py +++ b/test/xpu/test_einsum_op_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/test/xpu/test_elementwise_add_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py rename to test/xpu/test_elementwise_add_op_xpu.py index 06db6b54a6740..8d894a7b8828c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py +++ b/test/xpu/test_elementwise_add_op_xpu.py @@ -13,19 +13,18 @@ # limitations under the License. import sys - -import numpy as np - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + +import numpy as np from eager_op_test import OpTest, skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py b/test/xpu/test_elementwise_add_op_xpu_kp.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py rename to test/xpu/test_elementwise_add_op_xpu_kp.py index d9ef90fb2363f..267ba2ec2b959 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py +++ b/test/xpu/test_elementwise_add_op_xpu_kp.py @@ -13,12 +13,11 @@ # limitations under the License. 
import sys - -import numpy as np - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + +import numpy as np from eager_op_test import OpTest, skip_check_grad_ci from op_test_xpu import XPUOpTest diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/test/xpu/test_elementwise_div_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py rename to test/xpu/test_elementwise_div_op_xpu.py index fc1bf1d834aeb..ca7693d0ab8e7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py +++ b/test/xpu/test_elementwise_div_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py b/test/xpu/test_elementwise_floordiv_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py rename to test/xpu/test_elementwise_floordiv_op_xpu.py index 83c476a213ac0..3aa7a7f2c138a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py +++ b/test/xpu/test_elementwise_floordiv_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py b/test/xpu/test_elementwise_max_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py rename to test/xpu/test_elementwise_max_op_xpu.py index 66982e9a2c5e5..d9e96ec1fcb2c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py +++ b/test/xpu/test_elementwise_max_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py b/test/xpu/test_elementwise_min_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py rename to test/xpu/test_elementwise_min_op_xpu.py index c79cc9b8e130c..34223b52780f7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py +++ b/test/xpu/test_elementwise_min_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py b/test/xpu/test_elementwise_mod_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py rename to test/xpu/test_elementwise_mod_op_xpu.py index c00ea8db5c859..f909a12cc5e21 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py +++ b/test/xpu/test_elementwise_mod_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/test/xpu/test_elementwise_mul_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py rename to test/xpu/test_elementwise_mul_op_xpu.py index 6dea1d6b99199..a6c1319b5f19d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py +++ b/test/xpu/test_elementwise_mul_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest, skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py b/test/xpu/test_elementwise_pow_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py rename to test/xpu/test_elementwise_pow_op_xpu.py index 431ca838c1ab7..5864bfa00c793 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py +++ b/test/xpu/test_elementwise_pow_op_xpu.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest, skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/test/xpu/test_elementwise_sub_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py rename to test/xpu/test_elementwise_sub_op_xpu.py index 5b731ef32bb0d..aeddf4641d726 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py +++ b/test/xpu/test_elementwise_sub_op_xpu.py @@ -13,19 +13,18 @@ # limitations under the License. import sys - -import numpy as np - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + +import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py b/test/xpu/test_empty_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py rename to test/xpu/test_empty_op_xpu.py index 8724188127522..71c25f335b1ba 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py +++ b/test/xpu/test_empty_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") - import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid.framework import convert_np_dtype_to_dtype_ diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py b/test/xpu/test_expand_as_v2_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py rename to test/xpu/test_expand_as_v2_op_xpu.py index ac5e06c2682c8..586761c9aeac4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py +++ b/test/xpu/test_expand_as_v2_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py b/test/xpu/test_expand_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py rename to test/xpu/test_expand_v2_op_xpu.py index f7098282a62a3..9d869d14b32e2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py +++ b/test/xpu/test_expand_v2_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py b/test/xpu/test_fill_any_like_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py rename to test/xpu/test_fill_any_like_op_xpu.py index af8f9518b5483..079a86b07c44a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py +++ b/test/xpu/test_fill_any_like_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") - import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py b/test/xpu/test_fill_any_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py rename to test/xpu/test_fill_any_op_xpu.py index 95d514d94cecd..e351d9dacd1a3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py +++ b/test/xpu/test_fill_any_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py b/test/xpu/test_fill_constant_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py rename to test/xpu/test_fill_constant_op_xpu.py index 4bd9abae9a5b9..d2a01a1e6377b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py +++ b/test/xpu/test_fill_constant_op_xpu.py @@ -13,18 +13,18 @@ # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import convert_float_to_uint16 -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py b/test/xpu/test_fill_diagonal_tensor_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py rename to test/xpu/test_fill_diagonal_tensor_op_xpu.py index 3fbdf7abe6d14..de5025e8c4c05 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py +++ b/test/xpu/test_fill_diagonal_tensor_op_xpu.py @@ -15,18 +15,18 @@ import sys import unittest -import numpy as np - -import paddle +sys.path.append('../../python/paddle/fluid/tests/unittests') -sys.path.append("..") +import numpy as np from eager_op_test import skip_check_grad_ci -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py b/test/xpu/test_fill_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py rename to test/xpu/test_fill_op_xpu.py index 4cb43a2ba430c..99ca677ce4200 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py +++ b/test/xpu/test_fill_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/test/xpu/test_flatten2_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py rename to test/xpu/test_flatten2_op_xpu.py index 380da7b62d0b6..9595b9877bc5a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py +++ b/test/xpu/test_flatten2_op_xpu.py @@ -12,17 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -sys.path.append("..") import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/test/xpu/test_flatten_contiguous_range_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py rename to test/xpu/test_flatten_contiguous_range_op_xpu.py index af6f2095fc97d..05ad91958374b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py +++ b/test/xpu/test_flatten_contiguous_range_op_xpu.py @@ -12,21 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/test/xpu/test_flatten_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py rename to test/xpu/test_flatten_op_xpu.py index 9876b6c381540..7673ec9ba3d6d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py +++ b/test/xpu/test_flatten_op_xpu.py @@ -12,17 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py b/test/xpu/test_fleet_exe_dist_model_run_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py rename to test/xpu/test_fleet_exe_dist_model_run_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py b/test/xpu/test_fused_attention_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py rename to test/xpu/test_fused_attention_op_xpu.py index 3cdb5094f21d4..9db584f278e7f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py +++ b/test/xpu/test_fused_attention_op_xpu.py @@ -12,20 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -import numpy as np - -sys.path.append("..") - import unittest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +import numpy as np +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle import paddle.incubate.nn.functional as incubate_f diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py b/test/xpu/test_fused_feedforward_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py rename to test/xpu/test_fused_feedforward_op_xpu.py index feb7549a33e50..11f7148e188d0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py +++ b/test/xpu/test_fused_feedforward_op_xpu.py @@ -11,16 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sys - -import numpy as np - -sys.path.append("..") - import unittest +import numpy as np +from get_test_cover_info import XPUOpTestWrapper, create_test_class from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class import paddle import paddle.incubate.nn.functional as incubate_f diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py b/test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py rename to test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py index 35b943a3f4c77..394fe515554f3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py +++ b/test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py @@ -13,19 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") - import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py b/test/xpu/test_fused_gemm_epilogue_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py rename to test/xpu/test_fused_gemm_epilogue_op_xpu.py index 590276f58e4dc..37b1271963faf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py +++ b/test/xpu/test_fused_gemm_epilogue_op_xpu.py @@ -13,18 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import _legacy_C_ops diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/test/xpu/test_fused_resnet_basic_block_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py rename to test/xpu/test_fused_resnet_basic_block_op_xpu.py index 9c3156997035a..060217a6a1082 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/test/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -13,13 +13,13 @@ # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py b/test/xpu/test_gather_nd_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py rename to test/xpu/test_gather_nd_op_xpu.py index a22c10e0fec56..e642afffb44cf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py +++ b/test/xpu/test_gather_nd_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/test/xpu/test_gather_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py rename to test/xpu/test_gather_op_xpu.py index a57af602f9712..0d132e7185e64 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py +++ b/test/xpu/test_gather_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") - import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py rename to test/xpu/test_gaussian_random_op_xpu.py index 9d5b5e747f445..f30b994dcd18b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py +++ b/test/xpu/test_gaussian_random_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py b/test/xpu/test_gen_bkcl_id_op.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py rename to test/xpu/test_gen_bkcl_id_op.py index e13efff36e484..7c7ae3511a252 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py +++ b/test/xpu/test_gen_bkcl_id_op.py @@ -13,10 +13,7 @@ # limitations under the License. import os -import sys import unittest - -sys.path.append("..") from multiprocessing import Process from launch_function_helper import _find_free_port, wait diff --git a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py b/test/xpu/test_generate_proposals_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py rename to test/xpu/test_generate_proposals_v2_op_xpu.py index 3a97b28267d92..b7e22032f5f0c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py +++ b/test/xpu/test_generate_proposals_v2_op_xpu.py @@ -12,22 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import unittest - -import numpy as np - -sys.path.append("..") - import copy import math +import unittest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +import numpy as np +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/test/xpu/test_grid_sampler_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py rename to test/xpu/test_grid_sampler_op_xpu.py index c92ddc9531b21..1e171f2349392 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py +++ b/test/xpu/test_grid_sampler_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py b/test/xpu/test_group_norm_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py rename to test/xpu/test_group_norm_op_xpu.py index 67161776f81c2..16cec44287df8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py +++ b/test/xpu/test_group_norm_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/test/xpu/test_huber_loss_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py rename to test/xpu/test_huber_loss_op_xpu.py index 2a51e6ea95014..fa1e0b4b2ce87 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py +++ b/test/xpu/test_huber_loss_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py b/test/xpu/test_increment_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py rename to test/xpu/test_increment_op_xpu.py index 8ebbeae9654a6..5ef28f30b44a9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py +++ b/test/xpu/test_increment_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py b/test/xpu/test_index_sample_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py rename to test/xpu/test_index_sample_op_xpu.py index c9701af3e5786..e5204a1247f46 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py +++ b/test/xpu/test_index_sample_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py b/test/xpu/test_index_select_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py rename to test/xpu/test_index_select_op_xpu.py index 03e7debb59acf..62b9dd54c2e8b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py +++ b/test/xpu/test_index_select_op_xpu.py @@ -12,23 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -from paddle import fluid -from paddle.fluid import Program, program_guard - -sys.path.append("..") - import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle +from paddle import fluid +from paddle.fluid import Program, program_guard paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py b/test/xpu/test_instance_norm_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py rename to test/xpu/test_instance_norm_op_xpu.py index 8e0b777ea852c..5eb3e955deddf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py +++ b/test/xpu/test_instance_norm_op_xpu.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle -from paddle import fluid -from paddle.fluid import Program, program_guard - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle +from paddle import fluid +from paddle.fluid import Program, program_guard paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/test/xpu/test_iou_similarity_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py rename to test/xpu/test_iou_similarity_op_xpu.py index 9d8873666a3ab..301d5fb07b99a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py +++ b/test/xpu/test_iou_similarity_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -from numpy import random - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from numpy import random +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py b/test/xpu/test_isfinite_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py rename to test/xpu/test_isfinite_op_xpu.py index c5253bb90cbaa..93e6cf3533eca 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py +++ b/test/xpu/test_isfinite_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py b/test/xpu/test_kldiv_loss_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py rename to test/xpu/test_kldiv_loss_op_xpu.py index b3e3d7e5a058d..d3dd09b6c3c28 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py +++ b/test/xpu/test_kldiv_loss_op_xpu.py @@ -11,18 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.nn.functional import kl_div diff --git a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py b/test/xpu/test_label_smooth_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py rename to test/xpu/test_label_smooth_op_xpu.py index b83a32a313ad4..4ad7b3dc6a871 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py +++ b/test/xpu/test_label_smooth_op_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py b/test/xpu/test_lamb_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py rename to test/xpu/test_lamb_op_xpu.py index 70794de507f5e..c82bb5cd4e166 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py +++ b/test/xpu/test_lamb_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/test/xpu/test_layer_norm_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py rename to test/xpu/test_layer_norm_op_xpu.py index 12e6b49424093..1b98c4fe081b4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py +++ b/test/xpu/test_layer_norm_op_xpu.py @@ -12,23 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest from functools import reduce - -import numpy as np - -import paddle - -sys.path.append("..") from operator import mul -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +import numpy as np +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py b/test/xpu/test_linspace_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py rename to test/xpu/test_linspace_op_xpu.py index 65247c5bec50d..70fdb01b92159 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py +++ b/test/xpu/test_linspace_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest, convert_np_dtype_to_dtype_ -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest, convert_np_dtype_to_dtype_ import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py b/test/xpu/test_log_loss_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py rename to test/xpu/test_log_loss_op_xpu.py index 42a59da6d0dde..920c6c1f46931 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py +++ b/test/xpu/test_log_loss_op_xpu.py @@ -13,10 +13,10 @@ # limitations under the License. import sys - -sys.path.append("..") import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from eager_op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py b/test/xpu/test_log_softmax_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py rename to test/xpu/test_log_softmax_op_xpu.py index fdaaadcae81e1..269d3e76bca4a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py +++ b/test/xpu/test_log_softmax_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/test/xpu/test_logical_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py rename to test/xpu/test_logical_op_xpu.py index b07327283746d..44f891d2e3f65 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py +++ b/test/xpu/test_logical_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py b/test/xpu/test_logsumexp_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py rename to test/xpu/test_logsumexp_op_xpu.py index 46515eb6b1cfd..1d871797bb60c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py +++ b/test/xpu/test_logsumexp_op_xpu.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -import paddle - -sys.path.append("..") import numpy as np from op_test_xpu import XPUOpTest +import paddle + paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py b/test/xpu/test_lookup_table_v2_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py rename to test/xpu/test_lookup_table_v2_op_xpu.py index 8cb36afb2e490..7af995692a7a3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py +++ b/test/xpu/test_lookup_table_v2_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py b/test/xpu/test_masked_select_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py rename to test/xpu/test_masked_select_op_xpu.py index d526dae396dde..4ed6cd0a06e37 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py +++ b/test/xpu/test_masked_select_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/test/xpu/test_matmul_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py rename to test/xpu/test_matmul_op_xpu.py index 3484264cff6dd..07cea1b943c91 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/test/xpu/test_matmul_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/test/xpu/test_matmul_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py rename to test/xpu/test_matmul_v2_op_xpu.py index 4149af1226852..eb10d1462e466 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/test/xpu/test_matmul_v2_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/test/xpu/test_mean_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py rename to test/xpu/test_mean_op_xpu.py index a13bea88b6a80..66ed8d7edbce3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/test/xpu/test_mean_op_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") from op_test_xpu import XPUOpTest +import paddle from paddle.fluid import Program, program_guard np.random.seed(10) -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py b/test/xpu/test_merged_momentum_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py rename to test/xpu/test_merged_momentum_op_xpu.py index 1a6455a2a712e..8f3afc5a32697 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py +++ b/test/xpu/test_merged_momentum_op_xpu.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -sys.path.append("..") - -from test_merged_momentum_op_xpu_base import TestMergedMomentumBase -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_merged_momentum_op_xpu_base import TestMergedMomentumBase import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu_base.py b/test/xpu/test_merged_momentum_op_xpu_base.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu_base.py rename to test/xpu/test_merged_momentum_op_xpu_base.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py b/test/xpu/test_meshgrid_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py rename to test/xpu/test_meshgrid_op_xpu.py index dfb70604d65e1..6c00fa39d71bf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py +++ b/test/xpu/test_meshgrid_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py b/test/xpu/test_momentum_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py rename to test/xpu/test_momentum_op_xpu.py index 73d39c17ed072..50854cdeb9fae 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py +++ b/test/xpu/test_momentum_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/test/xpu/test_mul_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py rename to test/xpu/test_mul_op_xpu.py index 760f88bea0f25..a924cf42e84a3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py +++ b/test/xpu/test_mul_op_xpu.py @@ -12,19 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np +from op_test_xpu import XPUOpTest import paddle -sys.path.append("..") -from op_test_xpu import XPUOpTest - paddle.enable_static() -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py b/test/xpu/test_nearest_interp_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py rename to test/xpu/test_nearest_interp_op_xpu.py index 441439838cbcd..235ccbdd2de9c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py +++ b/test/xpu/test_nearest_interp_op_xpu.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import paddle -sys.path.append("..") - paddle.enable_static() ''' def nearest_neighbor_interp_np(X, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/test/xpu/test_nearest_interp_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py rename to test/xpu/test_nearest_interp_v2_op_xpu.py index 35c362b9a9aba..9caac459a9451 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py +++ b/test/xpu/test_nearest_interp_v2_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py b/test/xpu/test_one_hot_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py rename to test/xpu/test_one_hot_op_xpu.py index 4a24e3e2028e5..941387b3eb1fb 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py +++ b/test/xpu/test_one_hot_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/test/xpu/test_one_hot_v2_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py rename to test/xpu/test_one_hot_v2_op_xpu.py index 7fca3ab6827ab..80a60eed539c0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py +++ b/test/xpu/test_one_hot_v2_op_xpu.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -import paddle -from paddle import fluid -from paddle.fluid import core - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle +from paddle import fluid +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py b/test/xpu/test_p_norm_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py rename to test/xpu/test_p_norm_op_xpu.py index 959ae77ca0117..3f09c8eeda772 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py +++ b/test/xpu/test_p_norm_op_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py b/test/xpu/test_pad3d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py rename to test/xpu/test_pad3d_op_xpu.py index 7c4db207c8ef9..2757ed1e3e70a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py +++ b/test/xpu/test_pad3d_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/xpu/test_parallel_dygraph_dataparallel.py b/test/xpu/test_parallel_dygraph_dataparallel.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_parallel_dygraph_dataparallel.py rename to test/xpu/test_parallel_dygraph_dataparallel.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py b/test/xpu/test_pixel_shuffle_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py rename to test/xpu/test_pixel_shuffle_op_xpu.py index 6674cf33ebb04..444066ffbc548 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py +++ b/test/xpu/test_pixel_shuffle_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py rename to test/xpu/test_pool2d_op_xpu.py index f5a7bb398d63b..5c4233ee36e78 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/test/xpu/test_pool2d_op_xpu.py @@ -12,19 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from test_pool2d_op import adaptive_end_index, adaptive_start_index -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest +from test_pool2d_op import adaptive_end_index, adaptive_start_index import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py b/test/xpu/test_pool3d_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py rename to test/xpu/test_pool3d_op_xpu.py index 06161a14054c8..43b3675563e64 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py +++ b/test/xpu/test_pool3d_op_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py b/test/xpu/test_pool_max_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py rename to test/xpu/test_pool_max_op_xpu.py index 57f09ab1f7410..0eb11bb83b70d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py +++ b/test/xpu/test_pool_max_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py b/test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py rename to test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py index 78ca6933181aa..71da7768cc12f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py +++ b/test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py @@ -15,14 +15,14 @@ import sys import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + +from get_test_cover_info import record_op_test + import paddle from paddle.fluid.contrib.layers.nn import pow2_decay_with_linear_warmup from paddle.optimizer.lr import LinearWarmup, PolynomialDecay -sys.path.append("..") - -from xpu.get_test_cover_info import record_op_test - def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place): main = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py b/test/xpu/test_prelu_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py rename to test/xpu/test_prelu_op_xpu.py index 6bd4fcf8d5c56..0a0ea28269722 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py +++ b/test/xpu/test_prelu_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/test/xpu/test_prior_box_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py rename to test/xpu/test_prior_box_op_xpu.py index 52d3ca875efdf..3b69cbaba341e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/test/xpu/test_prior_box_op_xpu.py @@ -13,19 +13,15 @@ # limitations under the License. 
import math -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py b/test/xpu/test_prod_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py rename to test/xpu/test_prod_op_xpu.py index 1fb907f9f0924..a873fa8ecaf31 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py +++ b/test/xpu/test_prod_op_xpu.py @@ -15,9 +15,9 @@ import sys import unittest -import numpy as np +sys.path.append('../../python/paddle/fluid/tests/unittests') -sys.path.append("..") +import numpy as np from test_sum_op import TestReduceOPTensorAxisBase import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py b/test/xpu/test_randint_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py rename to test/xpu/test_randint_op_xpu.py index baeff8a10a640..e697109a1baea 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py +++ b/test/xpu/test_randint_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py b/test/xpu/test_randperm_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py rename to test/xpu/test_randperm_op_xpu.py index 0e285f6b03c0e..f28944e0009a2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py +++ b/test/xpu/test_randperm_op_xpu.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle -from paddle.fluid import core -from paddle.static import Program, program_guard - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle +from paddle.fluid import core +from paddle.static import Program, program_guard paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py b/test/xpu/test_range_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py rename to test/xpu/test_range_xpu.py index 2870cbb7a7cc6..f202a08c0f364 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py +++ b/test/xpu/test_range_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py b/test/xpu/test_recompute_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py rename to test/xpu/test_recompute_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py b/test/xpu/test_reduce_all_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py rename to test/xpu/test_reduce_all_op_xpu.py index 987b968b0a691..313d8297a1705 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py +++ b/test/xpu/test_reduce_all_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py b/test/xpu/test_reduce_amax_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py rename to test/xpu/test_reduce_amax_op_xpu.py index 49ffef884d3db..0de9b6c6e7306 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py +++ b/test/xpu/test_reduce_amax_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py b/test/xpu/test_reduce_amin_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py rename to test/xpu/test_reduce_amin_op_xpu.py index 4f2ca6fea3ff8..ad1d643bb9703 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py +++ b/test/xpu/test_reduce_amin_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py b/test/xpu/test_reduce_any_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py rename to test/xpu/test_reduce_any_op_xpu.py index a255dc390bcc0..5b4e0740cfacc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py +++ b/test/xpu/test_reduce_any_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/test/xpu/test_reduce_max_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py rename to test/xpu/test_reduce_max_op_xpu.py index dd00a711f85ac..1b76f78d09ac7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py +++ b/test/xpu/test_reduce_max_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/test/xpu/test_reduce_mean_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py rename to test/xpu/test_reduce_mean_op_xpu.py index ed3d51ff4fd62..d2447debaa479 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py +++ b/test/xpu/test_reduce_mean_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py b/test/xpu/test_reduce_min_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py rename to test/xpu/test_reduce_min_op_xpu.py index 87ab399863596..692d06df6a6d2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py +++ b/test/xpu/test_reduce_min_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py b/test/xpu/test_reduce_prod_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py rename to test/xpu/test_reduce_prod_op_xpu.py index 1e9c259f0e580..ab44b1be351e9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py +++ b/test/xpu/test_reduce_prod_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/test/xpu/test_reduce_sum_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py rename to test/xpu/test_reduce_sum_op_xpu.py index 4137b2b18cbc2..e6ed19365c65e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py +++ b/test/xpu/test_reduce_sum_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/test/xpu/test_refactor_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py rename to test/xpu/test_refactor_op_xpu.py index 55f32c876c74a..976a33244209a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py +++ b/test/xpu/test_refactor_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/test/xpu/test_reshape2_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py rename to test/xpu/test_reshape2_op_xpu.py index 41415a574b589..ce825d89c0957 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py +++ b/test/xpu/test_reshape2_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/test/xpu/test_rmsprop_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py rename to test/xpu/test_rmsprop_op_xpu.py index c905eb9a57974..604f9e4bb00bd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py +++ b/test/xpu/test_rmsprop_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/test/xpu/test_rnn_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py rename to test/xpu/test_rnn_op_xpu.py index e28a7ff9c10d5..2b6100247e379 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py +++ b/test/xpu/test_rnn_op_xpu.py @@ -10,26 +10,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import random +import sys import unittest +sys.path.append('../../python/paddle/fluid/tests/unittests') + import numpy as np from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core -sys.path.append("../rnn") +sys.path.append('../../python/paddle/fluid/tests/unittests/rnn') from convert import get_params_for_net -from rnn_numpy import LSTM -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from rnn_numpy import LSTM random.seed(2) np.set_printoptions(threshold=np.inf) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/test/xpu/test_roi_align_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py rename to test/xpu/test_roi_align_op_xpu.py index 1c3ad0af30c9c..d65f78be1a488 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/test/xpu/test_roi_align_op_xpu.py @@ -12,19 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import math import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py b/test/xpu/test_roll_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py rename to test/xpu/test_roll_op_xpu.py index 25b156a280a64..8c3a9c6fcb164 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py +++ b/test/xpu/test_roll_op_xpu.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -import paddle - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/test/xpu/test_scale_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py rename to test/xpu/test_scale_op_xpu.py index 1dd41e90a1700..fbc3b7f820856 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py +++ b/test/xpu/test_scale_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py b/test/xpu/test_scatter_nd_add_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py rename to test/xpu/test_scatter_nd_add_op_xpu.py index 12e159706ea1e..f303cd9ce5150 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py +++ b/test/xpu/test_scatter_nd_add_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py b/test/xpu/test_scatter_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py rename to test/xpu/test_scatter_op_xpu.py index 565549f0f16bb..50c860bdd8673 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py +++ b/test/xpu/test_scatter_op_xpu.py @@ -12,20 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
     type_dict_str_to_numpy,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/test/xpu/test_sequence_conv_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
rename to test/xpu/test_sequence_conv_op_xpu.py
index a4f960fc9e31b..4a52ea54f4aff 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
+++ b/test/xpu/test_sequence_conv_op_xpu.py
@@ -21,12 +21,12 @@
 import paddle
 
 sys.path.append("../")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 paddle.enable_static()
 np.set_printoptions(threshold=np.inf)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py b/test/xpu/test_sequence_unpad_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py
rename to test/xpu/test_sequence_unpad_op_xpu.py
index 65f52bcfc0b1d..15215fcb0c614 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py
+++ b/test/xpu/test_sequence_unpad_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py b/test/xpu/test_set_value_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py
rename to test/xpu/test_set_value_op_xpu.py
index 90277c7f484d6..e749eb8bc1b11 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py
+++ b/test/xpu/test_set_value_op_xpu.py
@@ -21,12 +21,12 @@
 import numpy as np
 
 sys.path.append("../")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle.fluid.layer_helper import LayerHelper
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/test/xpu/test_sgd_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
rename to test/xpu/test_sgd_op_xpu.py
index 42cdfd0c82d2a..6c57c19438ad6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
+++ b/test/xpu/test_sgd_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/test/xpu/test_shape_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
rename to test/xpu/test_shape_op_xpu.py
index 2f8d7ec830077..a812369ea526e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
+++ b/test/xpu/test_shape_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
rename to test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
index 8c0b3e4c73384..30369e9f22d85 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
+++ b/test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from scipy.special import expit, logit
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from scipy.special import expit, logit
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/test/xpu/test_sign_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
rename to test/xpu/test_sign_op_xpu.py
index 8743310a9c697..e6b2334f9b7f3 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
+++ b/test/xpu/test_sign_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/test/xpu/test_slice_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
rename to test/xpu/test_slice_op_xpu.py
index 09368723a1f48..f19c3d37e283e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/test/xpu/test_slice_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
 
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/test/xpu/test_softmax_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
rename to test/xpu/test_softmax_op_xpu.py
index 24c25bbe1a88e..9b849832bd984 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
+++ b/test/xpu/test_softmax_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
 
 paddle.enable_static()
 np.random.seed(10)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py b/test/xpu/test_softmax_with_cross_entropy_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
rename to test/xpu/test_softmax_with_cross_entropy_op_xpu.py
index 1ecc1eb4934ca..cb623e900d42b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
+++ b/test/xpu/test_softmax_with_cross_entropy_op_xpu.py
@@ -12,20 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("..")
-
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from test_softmax_op import stable_softmax
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from test_softmax_op import stable_softmax
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py b/test/xpu/test_split_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
rename to test/xpu/test_split_op_xpu.py
index dca61b4b129a1..8bc7ee9af1b04 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
+++ b/test/xpu/test_split_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("..")
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py b/test/xpu/test_squeeze2_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
rename to test/xpu/test_squeeze2_op_xpu.py
index b9598bc3ca08a..4e26152551c57 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
+++ b/test/xpu/test_squeeze2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py b/test/xpu/test_squeeze_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
rename to test/xpu/test_squeeze_op_xpu.py
index 85339b9eb8b8a..5aae366c85635 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
+++ b/test/xpu/test_squeeze_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/test/xpu/test_stack_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
rename to test/xpu/test_stack_op_xpu.py
index b13e1b9b300aa..3732de7dc33f5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/test/xpu/test_stack_op_xpu.py
@@ -13,18 +13,18 @@
 # limitations under the License.
 
 import sys
-
-sys.path.append("..")
 import unittest
 
+sys.path.append("../../python/paddle/fluid/tests/unittests")
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py b/test/xpu/test_strided_slice_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py
rename to test/xpu/test_strided_slice_op_xpu.py
index 7659ffd4ae0c6..63954dfd7859c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py
+++ b/test/xpu/test_strided_slice_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
 
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/test/xpu/test_sum_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
rename to test/xpu/test_sum_op_xpu.py
index 77d934e478cb5..3b51b0adb76d0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/test/xpu/test_sum_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("..")
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py b/test/xpu/test_temporal_shift_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py
rename to test/xpu/test_temporal_shift_op_xpu.py
index 4a1967326504f..71904903fc145 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py
+++ b/test/xpu/test_temporal_shift_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 import paddle.nn.functional as F
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/test/xpu/test_tile_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
rename to test/xpu/test_tile_op_xpu.py
index c6f9c79be4d6f..dc2b0d7f0edcd 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
+++ b/test/xpu/test_tile_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/test/xpu/test_top_k_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
rename to test/xpu/test_top_k_op_xpu.py
index 8dfbddbb1cf59..131bb0c1d0711 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
+++ b/test/xpu/test_top_k_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py b/test/xpu/test_top_k_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
rename to test/xpu/test_top_k_v2_op_xpu.py
index eaad7001928fa..8230aa0ff5d22 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
+++ b/test/xpu/test_top_k_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py b/test/xpu/test_transpose_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
rename to test/xpu/test_transpose_op_xpu.py
index 458cf8a667421..f314eb6e4dc77 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
+++ b/test/xpu/test_transpose_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/test/xpu/test_tril_triu_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
rename to test/xpu/test_tril_triu_op_xpu.py
index 010cf6fb6102e..15371d894fa8d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
+++ b/test/xpu/test_tril_triu_op_xpu.py
@@ -10,19 +10,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("..")
-
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import tensor
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py b/test/xpu/test_truncated_gaussian_random_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
rename to test/xpu/test_truncated_gaussian_random_op_xpu.py
index 7355acdfcee48..c217a2641d160 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
+++ b/test/xpu/test_truncated_gaussian_random_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("..")
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py b/test/xpu/test_unbind_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py
rename to test/xpu/test_unbind_op_xpu.py
index fd0f36677f8fc..dc8ea7ae6bc14 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py
+++ b/test/xpu/test_unbind_op_xpu.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid, tensor
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py b/test/xpu/test_unfold_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py
rename to test/xpu/test_unfold_op_xpu.py
index e1034d6363628..c6e80469f7d0d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py
+++ b/test/xpu/test_unfold_op_xpu.py
@@ -12,21 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-import paddle
-from paddle import fluid
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle import fluid
 
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py b/test/xpu/test_uniform_random_op_xpu.py
similarity index 95%
rename from python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
rename to test/xpu/test_uniform_random_op_xpu.py
index 3dff72b5d680c..f5fd57bd36696 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
+++ b/test/xpu/test_uniform_random_op_xpu.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 import sys
-
-sys.path.append("..")
 import unittest
 
+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from test_uniform_random_op import (
     TestUniformRandomOp,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/test/xpu/test_unsqueeze2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
rename to test/xpu/test_unsqueeze2_op_xpu.py
index 56862299074f3..d8cb02e64f993 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
+++ b/test/xpu/test_unsqueeze2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py b/test/xpu/test_unsqueeze_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py
rename to test/xpu/test_unsqueeze_op_xpu.py
index 4f2b1d2b5a8ad..333633031bdfd 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py
+++ b/test/xpu/test_unsqueeze_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py b/test/xpu/test_unstack_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py
rename to test/xpu/test_unstack_op_xpu.py
index 6195ec55abd41..9d305a312b74b 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py
+++ b/test/xpu/test_unstack_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/test/xpu/test_update_loss_scaling_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py
rename to test/xpu/test_update_loss_scaling_op_xpu.py
index e2b7263fed26e..86e6aac6badb5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py
+++ b/test/xpu/test_update_loss_scaling_op_xpu.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
-sys.path.append("..")
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py b/test/xpu/test_warpctc_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py
rename to test/xpu/test_warpctc_op_xpu.py
index d09db48cffc51..95cf65075472f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py
+++ b/test/xpu/test_warpctc_op_xpu.py
@@ -13,18 +13,16 @@
 # limitations under the License.
 
 import sys
-
-sys.path.append("..")
 import unittest
 
 import numpy as np
-from op_test_xpu import XPUOpTest
-from test_softmax_op import stable_softmax
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from test_softmax_op import stable_softmax
 
 import paddle
 import paddle.nn.functional as F
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py b/test/xpu/test_where_index_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py
rename to test/xpu/test_where_index_xpu.py
index 1a8e7aa96453c..cca29f5737336 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py
+++ b/test/xpu/test_where_index_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/test/xpu/test_where_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
rename to test/xpu/test_where_op_xpu.py
index a7a26f32b02cb..8dd7500517aed 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
+++ b/test/xpu/test_where_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py b/test/xpu/test_while_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py
rename to test/xpu/test_while_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py b/test/xpu/test_xpu_place.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
rename to test/xpu/test_xpu_place.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_stream_event.py b/test/xpu/test_xpu_stream_event.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_xpu_stream_event.py
rename to test/xpu/test_xpu_stream_event.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py
rename to test/xpu/test_zero_dim_tensor_xpu.py
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 50f9344c66fe4..8cfcb63e84c70 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -248,8 +248,8 @@ fi
 NO_NPU_FILE=`git diff --name-only upstream/$BRANCH | grep -v "_npu.py"`
 HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH ${NO_NPU_FILE} | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
 if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
"${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1, QingshuChen or qili93) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" - check_approval 1 22165420 6836917 46661762 26922892 16605440 2002279 + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1, QingshuChen, qili93 or ZzSean) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + check_approval 1 22165420 6836917 46661762 26922892 16605440 2002279 32410583 fi HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true` @@ -456,8 +456,8 @@ if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK if [ "${CHECK_WHOLE}" != "" ] ; then CHECK_OP=${CHECK_WHOLE//+/'\n+'} - echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4, QingshuChen(Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" - check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 2002279 + echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4, QingshuChen(Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit, phlrain or ZzSean) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" + check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 2002279 32410583 fi fi diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index c6f190189a67f..14c4e5075f6d8 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -406,7 +406,7 @@ def get_pr_ut(self): ut_list.append('md_placeholder') onlyCommentsFilesOrXpu.append(f_judge) elif ( - 'tests/unittests/xpu' in f_judge + 'test/xpu' in f_judge or 'tests/unittests/npu' in f_judge or 'op_npu.cc' in f_judge ):