diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 180214970be99..2899119cde9a8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -23,7 +23,8 @@ set(CBLAS_TAG v0.3.7) # https://github.com/PaddlePaddle/Paddle/pull/52983 if(UNIX AND NOT APPLE - AND NOT WITH_ROCM) + AND NOT WITH_ROCM + AND NOT WITH_XPU) set(CBLAS_TAG v0.3.18) endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 6202d4edf0496..c2be9fab7ee78 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") set(XPU_XFT_LIB_NAME "libxft.so") set(XPU_BASE_DATE "20230427") -set(XPU_XCCL_BASE_VERSION "1.0.13") +set(XPU_XCCL_BASE_VERSION "1.0.49.2") set(XPU_XFT_BASE_VERSION "latest") if(NOT DEFINED XPU_BASE_URL) @@ -30,35 +30,41 @@ if(NOT XPU_XFT_BASE_URL) ) endif() +if(WITH_XCCL_RDMA) + set(XPU_XCCL_PREFIX "xccl_rdma") +else() + set(XPU_XCCL_PREFIX "xccl_socket") +endif() + if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") - set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-kylin_aarch64") set(XPU_XFT_DIR_NAME "") # TODO: xft has no kylin output at now. elseif(WITH_SUNWAY) set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") - set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-deepin_sw6_64") set(XPU_XFT_DIR_NAME "") # TODO: xft has no deepin output at now. elseif(WITH_BDCENTOS) set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64") set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82") elseif(WITH_UBUNTU) set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64") set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64") elseif(WITH_CENTOS) set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64") set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82") else() set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64") set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64") endif() @@ -75,9 +81,6 @@ set(XPU_XFT_URL "${XPU_XFT_BASE_URL}/${XPU_XFT_DIR_NAME}.tar.gz") set(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) -set(XPU_CHECK_DEPENCE_URL - "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/check_xpu_dependence.sh" - CACHE STRING "" FORCE) set(XPU_XFT_GET_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/get_xft_dependence.sh" CACHE STRING "" FORCE) @@ -115,8 +118,8 @@ ExternalProject_Add( PREFIX ${SNAPPY_PREFIX_DIR} DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} DOWNLOAD_COMMAND - wget ${XPU_CHECK_DEPENCE_URL} && bash check_xpu_dependence.sh - ${XPU_BASE_URL} ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash + bash ${CMAKE_SOURCE_DIR}/tools/xpu/check_xpu_dependence.sh ${XPU_BASE_URL} + ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} 
${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} && wget ${XPU_XFT_GET_DEPENCE_URL} && bash get_xft_dependence.sh ${XPU_XFT_URL} diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc index 47dd2241c2cde..a3c3e085c6df5 100644 --- a/paddle/fluid/distributed/collective/process_group_bkcl.cc +++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc @@ -115,7 +115,13 @@ std::shared_ptr ProcessGroupBKCL::Recv( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_recv"; + VLOG(3) << "calling bkcl_recv" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", src_rank: " << src_rank << ", numel: " << output->numel() + << ", dtype: " << output->type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_recv(comm, output->data(), output->numel(), @@ -148,7 +154,14 @@ std::shared_ptr ProcessGroupBKCL::Send( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_send"; + VLOG(3) << "calling bkcl_send" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", dst_rank: " << dst_rank + << ", input numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_send(comm, input.data(), input.numel(), @@ -276,7 +289,14 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_all_reduce(comm, input.data(), @@ -307,7 +327,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( BKCLContext_t comm, const XPUStream& stream) { int root = opts.source_rank + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_broadcast(comm, input.data(), @@ -346,7 +372,13 @@ std::shared_ptr ProcessGroupBKCL::AllGather( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_gather"; + VLOG(3) << "calling bkcl_all_gather" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << in_tensor_maybe_partial.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_all_gather(comm, in_tensor_maybe_partial.data(), @@ -375,7 +407,15 @@ std::shared_ptr ProcessGroupBKCL::Reduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << 
"bkcl_reduce"; + VLOG(3) << "calling bkcl_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << opts.root_rank << ", numel: " << input.numel() + << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_reduce(comm, input.data(), output->data(), @@ -405,7 +445,14 @@ std::shared_ptr ProcessGroupBKCL::ReduceScatter( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_reduce_scatter"; + VLOG(3) << "calling bkcl_reduce_scatter" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << output->numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_reduce_scatter( comm, input.data(), @@ -491,8 +538,13 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; - + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << true << ", use_calc_stream: " << false; int r = bkcl_all_reduce(comm, input.data(), @@ -535,7 +587,13 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op << ", use_calc_stream: " << false; int r = bkcl_all_reduce(comm, input.data(), @@ -580,7 +638,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( const XPUStream& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << true + << ", use_calc_stream: " << false; int r = bkcl_broadcast(comm, input.data(), @@ -626,7 +690,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( const XPUStream& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << false; int r = bkcl_broadcast(comm, input.data(), @@ -671,7 +741,12 @@ std::shared_ptr ProcessGroupBKCL::AllGather( 
      const phi::DenseTensor& input,
      BKCLContext_t comm,
      const XPUStream& stream) {
-        VLOG(3) << "bkcl_all_gather";
+        VLOG(3) << "calling bkcl_all_gather"
+                << ", rank_id: " << platform::GetBKCLRankID(comm)
+                << ", dev_id: " << platform::GetBKCLDevID(comm)
+                << ", nranks: " << platform::GetBKCLNRanks(comm)
+                << ", numel: " << input.numel() << ", dtype: " << input.type()
+                << ", sync_op: " << true << ", use_calc_stream: " << false;
         int r = bkcl_all_gather(comm,
                                 input.data(),
@@ -712,7 +787,12 @@ std::shared_ptr ProcessGroupBKCL::AllGather(
      const phi::DenseTensor& input,
      BKCLContext_t comm,
      const XPUStream& stream) {
-        VLOG(3) << "bkcl_all_gather";
+        VLOG(3) << "calling bkcl_all_gather"
+                << ", rank_id: " << platform::GetBKCLRankID(comm)
+                << ", dev_id: " << platform::GetBKCLDevID(comm)
+                << ", nranks: " << platform::GetBKCLNRanks(comm)
+                << ", numel: " << input.numel() << ", dtype: " << input.type()
+                << ", sync_op: " << sync_op << ", use_calc_stream: " << false;
         int r = bkcl_all_gather(comm,
                                 input.data(),
diff --git a/paddle/fluid/distributed/collective/process_group_custom.cc b/paddle/fluid/distributed/collective/process_group_custom.cc
index b6c7063fd6fb7..1e4d1df337bdb 100644
--- a/paddle/fluid/distributed/collective/process_group_custom.cc
+++ b/paddle/fluid/distributed/collective/process_group_custom.cc
@@ -125,12 +125,14 @@ void ProcessGroupCustom::BroadcastUniqueCustomID(
     std::vector& ccl_ids) {  // NOLINT
   if (rank_ == 0) {
     for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       store_->set(key, ccl_ids[i]);
     }
   } else {
     for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       ccl_ids[i] = store_->get(key);
     }
   }
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index ed21e1171c17c..08a5f57d293af 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -68,7 +68,6 @@
     "matmul_double_grad",
     "tanh_double_grad",
     "add_double_grad",
-    "multiply_double_grad",
     "subtract_double_grad",
 ]
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
index 38c9ce3d8091e..693365c9f47ca 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
@@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(
   DeviceContext* dev_ctx = nullptr;
-  // only gpu needs update. xpu not need, because xpu memcpy op kernel is
+  // only gpu needs update; xpu does not, because the xpu memcpy op kernel is
   // synchronous.
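The store-key change in process_group_custom.cc above is worth pausing on: the ccl root-id exchange now namespaces its keys by group id (gid_), so two process groups that share one store can no longer read each other's ids. A minimal sketch of the collision this prevents, using a hypothetical in-memory stand-in for the distributed key-value store:

#include <cassert>
#include <map>
#include <string>

// Hypothetical stand-in for the TCP store that all process groups share.
struct Store {
  std::map<std::string, std::string> kv;
  void set(const std::string& k, const std::string& v) { kv[k] = v; }
  std::string get(const std::string& k) { return kv.at(k); }
};

int main() {
  Store store;
  // Old scheme: both groups write the same key, the second write clobbers
  // the first, and one group bootstraps with the other group's root id.
  store.set("ProcessGroupCustom/ccl_ids/0", "root_id_of_group_0");
  store.set("ProcessGroupCustom/ccl_ids/0", "root_id_of_group_1");
  // New scheme: the gid segment gives each group its own namespace.
  store.set("ProcessGroupCustom/ccl_ids/0/0", "root_id_of_group_0");
  store.set("ProcessGroupCustom/ccl_ids/1/0", "root_id_of_group_1");
  assert(store.get("ProcessGroupCustom/ccl_ids/0/0") == "root_id_of_group_0");
  return 0;
}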
  if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
    VLOG(6) << "Parse DeviceContext for " << op_type
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 21547331aa08f..66658020b66c6 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -114,6 +114,25 @@ struct SimpleOpTypeSetTeller : public Teller {
         "atanh", "ceil", "celu", "floor", "round",
         "sign", "logical_not", "reciprocal", "tanh_shrink", "logsigmoid",
         "erf", "bitwise_not", "equal", "not_equal", "rsqrt"};
+
+    // Static shape mode does not support inputs whose shape is 0-D or 1-D.
+    if (!with_dynamic_shape) {
+      auto inputs = desc.Inputs();
+      for (auto iter : inputs) {
+        for (auto var_name : iter.second) {
+          auto* block = desc.Block();
+          if (block) {
+            auto* var_desc = block->FindVar(var_name);
+            // Can't get feed op's TensorDesc
+            if (op_type != "feed" && var_desc && !var_desc->Persistable()) {
+              const auto shape = var_desc->GetShape();
+              if (shape.size() == 1 || shape.size() == 0) return false;
+            }
+          }
+        }
+      }
+    }
+
     if (act_op_list.find(op_type) != act_op_list.end()) {
       auto* block = desc.Block();
       if (block == nullptr) {
@@ -122,15 +141,6 @@ struct SimpleOpTypeSetTeller : public Teller {
                    "the pass.";
        return false;
      }
-      auto x_var_name = desc.Input("X")[0];
-      auto* x_var_desc = block->FindVar(x_var_name);
-      const auto x_shape = x_var_desc->GetShape();
-      if (!with_dynamic_shape && (x_shape.size() == 1 || x_shape.size() == 0)) {
-        VLOG(3) << op_type
-                << " op does not support input's dim is 1 or 0 in tensorrt "
-                   "static shape mode.";
-        return false;
-      }
 #if !IS_TRT_VERSION_GE(7000)
      if (op_type == "erf") {
        VLOG(3) << op_type << " op does not support tensorrt.";
@@ -138,6 +148,9 @@
      }
 #endif
 #if !IS_TRT_VERSION_GE(8600)
+      auto x_var_name = desc.Input("X")[0];
+      auto* x_var_desc = block->FindVar(x_var_name);
+      const auto x_shape = x_var_desc->GetShape();
      if (x_shape.size() == 0 && unary_list.find(op_type) != unary_list.end()) {
        VLOG(3) << op_type
                << " op does not support 0 dim input when TensorRT < 8.6.";
@@ -145,24 +158,6 @@
      }
 #endif
    }
-    // In static shape in Paddle-TRT, we can't allow that one op has a
-    // 1D intermediate tensor as input.
-    if (!with_dynamic_shape) {
-      auto inputs = desc.Inputs();
-      for (auto iter : inputs) {
-        for (auto var_name : iter.second) {
-          auto* block = desc.Block();
-          if (block) {
-            auto* var_desc = block->FindVar(var_name);
-            // Can't get feed op's TensorDesc
-            if (op_type != "feed" && var_desc && !var_desc->Persistable()) {
-              const auto shape = var_desc->GetShape();
-              if (shape.size() == 1) return false;
-            }
-          }
-        }
-      }
-    }

    if (op_type == "dropout") {
      /*
@@ -1491,6 +1486,7 @@ struct SimpleOpTypeSetTeller : public Teller {
                   "elementwise op.";
        return false;
      }
+
      if (x_var_desc->Persistable() && !with_dynamic_shape) {
        VLOG(3)
            << "Input X is a parameter which is not supported for "
@@ -1864,8 +1860,10 @@ struct SimpleOpTypeSetTeller : public Teller {
      auto x_var_name = desc.Input("X")[0];
      auto* x_var_desc = block->FindVar(x_var_name);
      const auto x_shape = x_var_desc->GetShape();
-      if (x_shape.size() == 1) {
-        VLOG(3) << "mish op does not support input's dim is 1 in tensorrt.";
+      if ((!with_dynamic_shape && x_shape.size() == 1) || x_shape.size() == 0) {
+        VLOG(3) << op_type
+                << " op does not support 0-dim input, or 1-dim input in "
+                   "tensorrt static shape mode.";
        return false;
      }
    }
@@ -2598,6 +2596,15 @@ struct SimpleOpTypeSetTeller : public Teller {
                   "the pass.";
        return false;
      }
+
+#if IS_TRT_VERSION_LT(8000)
+      auto x_var_name = desc.Input("X")[0];
+      auto* x_var_desc = block->FindVar(x_var_name);
+      const auto x_shape = x_var_desc->GetShape();
+      if (x_shape.size() == 0) {
+        return false;  // 0-dim input is not supported.
+      }
+#endif
    }

    if (op_type == "grid_sampler") {
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 527e843d05bb8..931372a0d9a43 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1039,8 +1039,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #elif defined(PADDLE_WITH_XPU)
   return GetAllocator(place)->Allocate(size);
 #else
-  PADDLE_THROW(platform::errors::PreconditionNotMet(
-      "Not compiled with GPU or XPU or NPU."));
+  PADDLE_THROW(
+      platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
 #endif
 }
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index a6d807b028c1b..07a1c46ac0923 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -95,7 +95,7 @@ struct BeamSearchDecodeFunctor {
   } else {
     BeamSearchDecoder beam_search_decoder(beam_size_, end_id_);
-    // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead
+    // Check if the tensor is on GPU.
If so, use the CPU copy instead if (tensor_on_gpu_ || tensor_on_npu_) { beam_search_decoder.Backtrace( step_ids_, step_scores_, id_tensor_, score_tensor_); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index aee2b0c86b81e..2efd5b46bdc09 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -79,7 +79,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); AddInput("Ids", - "An input with type int32 or int64 in CPU and GPU, int32 in NPU " + "An input with type int32 or int64 in CPU and GPU, " "contains the ids to be looked up in W."); AddOutput("Out", "The lookup results, which have the same type as W."); diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 0d3b4ed92e9b2..723c5e48a5ae4 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -28,7 +28,7 @@ namespace operators { template class GlobalGatherOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support global gather op for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index 3cb2a3c7fc41b..fc4b48500c071 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -28,7 +28,7 @@ namespace operators { template class GlobalScatterOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support global scatter op for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 6b827a2656f29..815558d0227eb 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -29,7 +29,7 @@ namespace operators { template class PartialAllGatherOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_allgather for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h index fdf3f02b0d679..baf47ef9dff8d 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.h +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -27,7 +27,7 @@ namespace operators { template class PartialRecvOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_recv for cpu kernel now.")); } diff --git 
a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h index 773125be7d40f..b7b72789b87ff 100644 --- a/paddle/fluid/operators/collective/partial_send_op.h +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -28,7 +28,7 @@ namespace operators { template class PartialSendOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_send for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/recv_v2_op.h b/paddle/fluid/operators/collective/recv_v2_op.h index 3430cdb73aa1a..e76e4a7b55197 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.h +++ b/paddle/fluid/operators/collective/recv_v2_op.h @@ -27,7 +27,7 @@ namespace operators { template class RecvOpV2CPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support recv for cpu kernel now.")); } diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 7d1ae606d1710..7db85eebb4f84 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -27,11 +27,6 @@ cc_library( SRCS while_op_helper.cc DEPS operator op_variant) -cc_test( - conditional_block_op_test - SRCS conditional_block_op_test.cc - DEPS conditional_block_op standalone_executor executor) - if(WITH_UNITY_BUILD) target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) else() diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 3e07b1f155452..1bca2068f83d8 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -91,9 +91,5 @@ cc_library( mask_util SRCS mask_util.cc DEPS memory) -cc_test( - mask_util_test - SRCS mask_util_test.cc - DEPS memory mask_util) detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt index 6680e1f23e08d..de017546fb34e 100644 --- a/paddle/fluid/operators/dlnne/CMakeLists.txt +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -45,8 +45,3 @@ op_library( #endif() target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) - -cc_test( - test_dlnne_engine_op - SRCS dlnne_engine_op_test.cc - DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index 6594df2f5164f..58d81ebf8be06 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -66,7 +66,6 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel { phi::XpuFcInfo fc_info; phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &fc_info); - VLOG(0) << "FusedGemmEpilogueXPUKernel 000"; xpu::Context* xpu_ctx = dev_ctx.x_context(); const XPUType* x_ptr = reinterpret_cast(x->data()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 5cea4fa9e0573..40c82619db4a3 100644 
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -622,13 +622,12 @@ class ReduceBaseOp : public framework::OperatorWithKernel {
   // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN
   if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE_EQ(
-          platform::is_gpu_place(ctx.GetPlace()) ||
-              platform::is_xpu_place(ctx.GetPlace()) ||
-              platform::is_custom_place(ctx.GetPlace()),
-          true,
-          platform::errors::InvalidArgument(
-              "float16 can only be used on GPU or NPU or XPU place"));
+      PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) ||
+                            platform::is_xpu_place(ctx.GetPlace()) ||
+                            platform::is_custom_place(ctx.GetPlace()),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "float16 can only be used on GPU or XPU place"));
   }
   return phi::KernelKey(input_data_type, ctx.GetPlace());
 }
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 633ef748be698..2fb7883cb3f71 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -47,7 +47,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
           platform::is_custom_place(ctx.GetPlace()),
           true,
           platform::errors::InvalidArgument(
-              "float16 can only be used on GPU/NPU/XPU and custom place"));
+              "float16 can only be used on GPU/XPU and custom place"));
     }
     return phi::KernelKey(
         ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type));
@@ -130,7 +130,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
           platform::is_xpu_place(ctx.GetPlace()) ||
           platform::is_custom_place(ctx.GetPlace())))
        PADDLE_THROW(platform::errors::InvalidArgument(
-            "float16 can only be used on GPU/NPU/XPU and custom place"));
+            "float16 can only be used on GPU/XPU and custom place"));
     }
     return phi::KernelKey(
         ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type));
 }
diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h
index 1f44bc0a8c98b..dcb17bfd2b932 100644
--- a/paddle/fluid/platform/device/xpu/bkcl_helper.h
+++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h
@@ -62,6 +62,18 @@ inline BKCLDataType ToBKCLDataType(framework::proto::VarType::Type type) {
   }
 }
+inline int GetBKCLRankID(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[0];
+}
+
+inline int GetBKCLDevID(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[1];
+}
+
+inline int GetBKCLNRanks(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[2];
+}
+
 class BKCLGroupGuard {
  public:
   static std::mutex &BKCLMutex() {
diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h
index f56688de09a32..e2de1e5a9abe3 100644
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -65,7 +65,7 @@ class DeviceEvent {
                           MaxDeviceTypes,
                           type_id_));
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
-    // TODO(Aurelius84): only support CPU/CUDA/NPU.
+    // TODO(Aurelius84): only support CPU/CUDA.
     PADDLE_ENFORCE_LT(type_id_,
                       3,
                       platform::errors::Unavailable(
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 2caf978db6f23..7f32c14c493af 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -931,7 +931,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
   // with FLAGS_set_to_1d=True. In this case, one `None` should be pop out,
   // otherwise the output shape will be not correct.
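The GetBKCLRankID/GetBKCLDevID/GetBKCLNRanks helpers added to bkcl_helper.h only make sense if the opaque BKCLContext_t begins with three int fields in the order rank id, device id, nranks; that layout is implied by the [0]/[1]/[2] indexing, not stated anywhere in this diff, so treat it as an assumption. A mock of that assumption:

#include <cassert>

// Mock of the assumed layout: the helpers read ints 0..2 of the opaque
// context, so rank_id, dev_id and nranks must be its first three fields.
struct MockBKCLContext {
  int rank_id;  // index [0]
  int dev_id;   // index [1]
  int nranks;   // index [2]
};

using BKCLContext_t = void*;  // stand-in for the real opaque handle

inline int GetBKCLRankID(BKCLContext_t comm) {
  return reinterpret_cast<int*>(comm)[0];
}

int main() {
  MockBKCLContext ctx{3, 7, 8};
  assert(GetBKCLRankID(&ctx) == 3);  // reads the first int field
  return 0;
}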
if (static_cast(decrease_axis.size()) == tensor->dims().size()) { - VLOG(0) + VLOG(1) << "Warning: In Tensor '__getitem__', if the number of scalar " "elements " "in the index is equal to the rank of the Tensor, the output " diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aa1ee4724925e..65eac1e3dc6fd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1060,7 +1060,7 @@ void BindImperative(py::module *m_ptr) { // not correct. if (static_cast(decrease_axis.size()) == tensor->dims().size()) { - VLOG(0) << "Warning: In Tensor '__getitem__', if the number " + VLOG(1) << "Warning: In Tensor '__getitem__', if the number " "of scalar " "elements " "in the index is equal to the rank of the Tensor, " diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 553701906f59e..3f20a2498f840 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -188,39 +188,7 @@ using namespace paddle::framework; // NOLINT void BindParallelExecutor(pybind11::module &m) { // NOLINT // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Returns: - ExecutionStrategy: An ExecutionStrategy object. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - paddle.enable_static() - - x = static.data(name='x', shape=[None, 13], dtype='float32') - y = static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = static.nn.fc(input=x, size=1, act=None) - - cost = F.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = static.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - )DOC"); + py::class_ exec_strategy(pe, "ExecutionStrategy"); py::enum_(m, "DeviceType", py::arithmetic()) .value("CPU", paddle::platform::DeviceType::CPU) @@ -233,29 +201,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT [](const ExecutionStrategy &self) { return self.num_threads_; }, [](ExecutionStrategy &self, size_t num_threads) { self.num_threads_ = num_threads; - }, - R"DOC( - The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. - If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - )DOC") + }) .def_property( "_use_device", [](const ExecutionStrategy &self) { return self.use_device_; }, @@ -268,11 +214,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](ExecutionStrategy &self, bool allow_op_delay) { self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") + }) .def_property( "num_iteration_per_drop_scope", [](const ExecutionStrategy &self) { @@ -280,30 +222,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT }, [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 100. - - .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_drop_scope = 10 - )DOC") + }) .def_property( "num_iteration_per_run", [](const ExecutionStrategy &self) { @@ -311,29 +230,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT }, [](ExecutionStrategy &self, size_t num_iteration_per_run) { self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call exe.run() in python。Default: 1. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_run = 10 - )DOC") + }) .def_property( "use_thread_barrier", [](const ExecutionStrategy &self) { return self.thread_barrier_; }, [](ExecutionStrategy &self, bool use_thread_barrier) { self.thread_barrier_ = use_thread_barrier; - }, - R"DOC(This config that the this is distributed training with parameter server - )DOC") + }) .def_property( "_dry_run", [](const ExecutionStrategy &self) { return self.dry_run_; }, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 65132bc68fa0d..6f8ae115bd12a 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -434,7 +434,7 @@ void SetTensorFromPyArrayT( } #else PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Cannot use IPUPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_custom_place(place)) { @@ -1106,7 +1106,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, return py_arr; #else PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Cannot use CustomPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with CustomPlace " "support.")); #endif diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 9d1b2ce5b4933..3eaafe2b407ad 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -617,6 +617,7 @@ func : multiply_double_grad optional : grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) + backward : multiply_triple_grad composite : multiply_double_grad(x, y, grad_out, grad_x_grad, grad_y_grad, axis, x_grad, y_grad, grad_out_grad) - backward_op : multiply_grad @@ -631,6 +632,17 @@ composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad) backward : multiply_double_grad +- backward_op : multiply_triple_grad + forward : multiply_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, int aixs = -1) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) + args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) + infer_meta : + func : GeneralQuinaryGradInferMeta + param : [x, y, fwd_grad_out, fwd_grad_grad_x, fwd_grad_grad_y] + kernel : + func : multiply_triple_grad + optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_x_grad, grad_y_grad, grad_grad_out_grad + - backward_op : norm_grad forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 72562f0c001f0..06d9b450a83ab 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -41,7 +41,7 @@ inline size_t Alignment(size_t size, alignment = alignment; #else PADDLE_THROW(phi::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA/XPU/NPU.")); + "Fluid is 
not compiled with CUDA/XPU.")); #endif } } diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 4c3688b0badfa..6f10baf07bb5a 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -185,6 +185,9 @@ XPUOpMap& get_kl2_ops() { {"deformable_conv_v1", XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_transpose_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32})}, {"diag_v2", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -531,6 +534,16 @@ XPUOpMap& get_kl2_ops() { {"p_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, + {"pad_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, {"pixel_shuffle", XPUKernelSet({phi::DataType::FLOAT32})}, {"pixel_shuffle_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pool2d_grad", diff --git a/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h index 7248985bf294c..18a6a0518a3cc 100644 --- a/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h @@ -27,8 +27,8 @@ template void EigvalshGradKernel(const Context& dev_ctx, const DenseTensor& out_v, const DenseTensor& out_w_grad, - const std::string& uplo, - bool is_test, + const std::string& uplo UNUSED, + bool is_test UNUSED, DenseTensor* x_grad) { auto tV = phi::TransposeLast2Dim(dev_ctx, phi::Conj(dev_ctx, out_v)); diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 400334ad4e04e..92a4f99c6eb77 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -752,7 +752,7 @@ void EinsumKernel(const Context& dev_ctx, const std::string& equation, DenseTensor* out, std::vector cache, - std::vector xshape) { + std::vector xshape UNUSED) { std::vector tmp; // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output // may have nullptr and the cache.size() is not equal to inputs.size(). 
refer diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 15f99a58fa5a5..3ce1e721b968e 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -119,7 +119,9 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, template struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } + HOSTDEVICE T operator()(T x UNUSED, T y, T out UNUSED, T dout) const { + return dout / y; + } }; template @@ -136,7 +138,7 @@ struct DivGradDX> { template struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x UNUSED, T y, T out, T dout) const { return -dout * out / y; } }; @@ -857,14 +859,14 @@ struct MinGradDy { template struct HeavisideGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x UNUSED, T y UNUSED, T out UNUSED, T dout) const { return dout * static_cast(0); } }; template struct HeavisideGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x, T y UNUSED, T out UNUSED, T dout) const { return dout * static_cast(x == static_cast(0)); } }; diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h index e0850b8aef0d9..b02d2a517a1c5 100644 --- a/paddle/phi/kernels/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h @@ -128,7 +128,7 @@ void ComputeImpl(const Context& dev_ctx, float beta1_f, float beta2_f, float epsilon_f, - bool multi_precision, + bool multi_precision UNUSED, DenseTensor* param_out, DenseTensor* mom1_out, DenseTensor* mom2_out, diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h index 5663484362a8e..e9ba46d0c162c 100644 --- a/paddle/phi/kernels/impl/lu_kernel_impl.h +++ b/paddle/phi/kernels/impl/lu_kernel_impl.h @@ -474,7 +474,7 @@ void Unpack_Pivot(const Context& dev_ctx, const DenseTensor& Pivot, DenseTensor* P, int h, - int w) { + int w UNUSED) { auto dims = Pivot.dims(); auto Pdimvec = vectorize(dims); auto prank = Pdimvec.size(); diff --git a/paddle/phi/kernels/impl/unstack_kernel_impl.h b/paddle/phi/kernels/impl/unstack_kernel_impl.h index 030f4a62c6e00..102126a1e3307 100644 --- a/paddle/phi/kernels/impl/unstack_kernel_impl.h +++ b/paddle/phi/kernels/impl/unstack_kernel_impl.h @@ -26,7 +26,7 @@ template void UnStackKernel(const Context &dev_ctx, const DenseTensor &x, int axis, - int num, + int num UNUSED, std::vector outs) { auto *dy = &x; auto dx = outs; diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index 2be0ba5649711..fd6cf0c577849 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -240,10 +240,10 @@ class ConvOneDNNHandlerT const std::string& padding_algorithm, const std::vector& dilations_in, int groups, - const std::string& data_format, + const std::string& data_format UNUSED, bool is_test, - phi::DenseTensor* filter_grad, - phi::DenseTensor* in_x_grad, + phi::DenseTensor* filter_grad UNUSED, + phi::DenseTensor* in_x_grad UNUSED, const std::string& unique_name) : funcs::OneDNNHandlerT& paddings, bool ceil_mode, bool exclusive, - const std::string& data_format, + const std::string& data_format UNUSED, const std::string& pooling_type, bool global_pooling, bool adaptive, diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h 
b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 69f667c36624b..7c512c6e3eb4e 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -118,7 +118,7 @@ void ReduceGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad, dnnl::algorithm binary_type, - dnnl::algorithm reduction_type, + dnnl::algorithm reduction_type UNUSED, float scale_x, float scale_y) { reduce_all = recompute_reduce_all(x, dims, reduce_all); diff --git a/paddle/phi/kernels/onednn/slice_kernel.cc b/paddle/phi/kernels/onednn/slice_kernel.cc index 6c927018264bf..dfda78de77c06 100644 --- a/paddle/phi/kernels/onednn/slice_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_kernel.cc @@ -25,7 +25,7 @@ void SliceKernel(const Context& dev_ctx, const std::vector& axes, const IntArray& starts, const IntArray& ends, - const std::vector& infer_flags, + const std::vector& infer_flags UNUSED, const std::vector& decrease_axis, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h index 4323a23e0e60c..f9c620c925689 100644 --- a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h @@ -131,7 +131,7 @@ void ComputeRowImpl(const Context& dev_ctx, float beta1_f, float beta2_f, float epsilon_f, - bool multi_precision, + bool multi_precision UNUSED, DenseTensor* param_out, DenseTensor* mom1_out, DenseTensor* mom2_out, diff --git a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index fe8b8ad6f8166..09c307a5c005f 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -38,10 +38,10 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, const DenseTensor& rulebook, const DenseTensor& counter, const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, + const std::vector& paddings UNUSED, + const std::vector& dilations UNUSED, + const std::vector& strides UNUSED, + const int groups UNUSED, const bool subm, const std::string& key, SparseCooTensor* x_grad, diff --git a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 48f04ad1ddfa5..7fcbb5cfdd1f2 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -34,7 +34,7 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - const int groups, + const int groups UNUSED, const bool subm, const std::string& key, SparseCooTensor* out, diff --git a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc index 9db36ace02e4d..4b61b61b5e254 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc @@ -97,6 +97,36 @@ void Conv2dTransposeGradKernel(const Context& ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_grad"); } +template +void DepthwiseConv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& 
padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + Conv2dTransposeGradKernel(ctx, + x, + filter, + dout, + strides, + paddings, + output_padding, + output_size, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} } // namespace phi PD_REGISTER_KERNEL(conv2d_transpose_grad, @@ -104,3 +134,8 @@ PD_REGISTER_KERNEL(conv2d_transpose_grad, ALL_LAYOUT, phi::Conv2dTransposeGradKernel, float) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad, + XPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeGradKernel, + float) {} diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc index 1b3c31f665c7c..f658f06a9908d 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc @@ -145,8 +145,39 @@ void Conv2dTransposeKernel(const Context& ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_v2"); } } +template +void DepthwiseConv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + Conv2dTransposeKernel(ctx, + x, + filter, + strides, + paddings, + output_padding, + output_size, + padding_algorithm, + groups, + dilations, + data_format, + out); +} } // namespace phi +PD_REGISTER_KERNEL(depthwise_conv2d_transpose, + XPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeKernel, + float) {} PD_REGISTER_KERNEL(conv2d_transpose, XPU, diff --git a/paddle/phi/kernels/xpu/pad_grad_kernel.cc b/paddle/phi/kernels/xpu/pad_grad_kernel.cc new file mode 100644 index 0000000000000..45fc3393412cd --- /dev/null +++ b/paddle/phi/kernels/xpu/pad_grad_kernel.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
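The depthwise_conv2d_transpose_grad kernel registered just above does no work of its own: a depthwise transposed convolution is a grouped transposed convolution with groups equal to the channel count, so the existing Conv2dTransposeGradKernel already covers it and the wrapper mainly binds the op name for the xpu2_op_list entries. A reduced sketch of the delegation pattern, with illustrative names:

#include <vector>

// Illustrative only: a "specialized" kernel that forwards verbatim to the
// general kernel, so only the registered name differs.
template <typename T>
void GeneralGradKernel(const std::vector<T>& dout, int groups,
                       std::vector<T>* dx) {
  *dx = dout;  // stand-in for the real grouped conv_transpose gradient math
}

template <typename T>
void DepthwiseGradKernel(const std::vector<T>& dout, int channels,
                         std::vector<T>* dx) {
  // Depthwise == grouped with groups == channels; just delegate.
  GeneralGradKernel<T>(dout, /*groups=*/channels, dx);
}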
+ +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + const Scalar& pad_value, + DenseTensor* d_x) { + using XPUType = typename XPUTypeTrait::Type; + std::vector pad_left, pad_right; + std::vector out_shape = vectorize(d_out.dims()); + dev_ctx.template Alloc(d_x); + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + pad_left.push_back(-paddings[i * 2]); + pad_right.push_back(-paddings[i * 2 + 1]); + } + + XPUType value = static_cast(pad_value.to()); + int r = xpu::pad(dev_ctx.x_context(), + reinterpret_cast(d_out.data()), + reinterpret_cast(d_x->data()), + out_shape, + pad_left, + pad_right, + value); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad"); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad_grad, + XPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + int, + int16_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/pad_kernel.cc b/paddle/phi/kernels/xpu/pad_kernel.cc new file mode 100644 index 0000000000000..899503e328607 --- /dev/null +++ b/paddle/phi/kernels/xpu/pad_kernel.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
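Note how PadGradKernel above avoids a dedicated crop primitive: it negates each padding amount and calls the same xpu::pad routine, because padding a side by -k removes k elements there, which is exactly how the gradient of pad maps the output-shaped d_out back onto the input shape. The shape arithmetic as a self-contained check:

#include <cassert>

// Gradient of pad as "pad with negated amounts": for each dim,
// out_dim = in_dim + left + right, so in_dim = out_dim + (-left) + (-right).
int main() {
  int in_h = 4, pad_left = 1, pad_right = 2;
  int out_h = in_h + pad_left + pad_right;           // forward pad: 4 -> 7
  int back_h = out_h + (-pad_left) + (-pad_right);   // backward "pad": 7 -> 4
  assert(back_h == in_h);
  return 0;
}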
+ +#include "paddle/phi/kernels/pad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + const Scalar& pad_value, + DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + dev_ctx.template Alloc(out); + std::vector pad_left, pad_right; + std::vector xshape = vectorize(x.dims()); + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + pad_left.push_back(paddings[i * 2]); + pad_right.push_back(paddings[i * 2 + 1]); + } + + XPUType value = static_cast(pad_value.to()); + int r = xpu::pad(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + xshape, + pad_left, + pad_right, + value); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad"); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad, + XPU, + ALL_LAYOUT, + phi::PadKernel, + float, + int, + int16_t, + phi::dtype::float16) {} diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 5af0dd12f3ff9..5c0cd89306c90 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -17,16 +17,16 @@ import numpy as np import paddle -from paddle.fluid.dataloader.batch_sampler import ( +from paddle.io import BatchSampler, IterableDataset +from paddle.io.dataloader.batch_sampler import ( DistributedBatchSampler, _InfiniteIterableSampler, ) -from paddle.fluid.dataloader.dataloader_iter import ( +from paddle.io.dataloader.dataloader_iter import ( _DatasetKind, default_collate_fn, default_convert_fn, ) -from paddle.io import BatchSampler, IterableDataset class DistributedDataLoaderBase(metaclass=abc.ABCMeta): @@ -272,7 +272,7 @@ def __next__(self): return next(self.data) def _create_inner_dataloader(self): - dataloader = paddle.fluid.io.DataLoader( + dataloader = paddle.io.DataLoader( self.dataset, feed_list=self.feed_list, places=self.places, diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index da20e312a1dc8..fa4b937ba56b7 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
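Both new pad kernels decode paddings identically: a flat vector carrying one (left, right) pair per tensor dimension, split by the i * 2 / i * 2 + 1 loop shown above. A standalone check of that layout convention:

#include <cassert>
#include <vector>

// paddings = {l0, r0, l1, r1, ...}: one (left, right) pair per dimension.
int main() {
  std::vector<int> paddings = {1, 2, 0, 3};  // dim0: (1,2), dim1: (0,3)
  std::vector<int> pad_left, pad_right;
  for (size_t i = 0; i < paddings.size() / 2; ++i) {
    pad_left.push_back(paddings[i * 2]);
    pad_right.push_back(paddings[i * 2 + 1]);
  }
  assert(pad_left[1] == 0 && pad_right[1] == 3);
  return 0;
}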
-
 import os
 
 import numpy as np
@@ -27,10 +26,30 @@
 _use_cache = False
 _enable_partial_send_recv = True
+_xpu_comm_group_started = False
+
 _sync_send = os.environ.get("PADDLE_P2P_SYNC_SEND", "0")
 _sync_send = _sync_send.lower() in ['1', 'true']
+
+
+def _xpu_comm_group_start():
+    if not paddle.is_compiled_with_xpu():
+        return
+    global _xpu_comm_group_started
+    assert not _xpu_comm_group_started
+    framework.core.ProcessGroupBKCL.group_start()
+    _xpu_comm_group_started = True
+
+
+def _xpu_comm_group_end():
+    if not paddle.is_compiled_with_xpu():
+        return
+    global _xpu_comm_group_started
+    if _xpu_comm_group_started:
+        framework.core.ProcessGroupBKCL.group_end()
+        _xpu_comm_group_started = False
+
+
 def initialize_p2p_groups(hcg, use_cache=True, enable_partial_send_recv=True):
     global _hcg, _use_cache, _enable_partial_send_recv
     _hcg = hcg
@@ -357,6 +376,7 @@ def _p2p_helper(
     # TODO(Yuang Liu): use batch_isend_irecv replace all these comm ops
     tasks = []
     # start to p2p communicate
+
     if _sync_send:
         # Some devices(NPU for example) do not support asynchronized send op, So the order is
         # recv_prev -> send_next -> recv_next -> send_prev
@@ -492,8 +512,8 @@ def _p2p_helper(
                     group=_hcg.send_prev_group,
                     use_calc_stream=False,
                 )
-    else:
+
+    else:
+        _xpu_comm_group_start()
         if tensor_send_prev is not None:
             if isinstance(tensor_send_prev, tuple):
                 for d in tensor_send_prev:
@@ -529,6 +549,7 @@
                     use_calc_stream=sync_recv,
                 )
                 if sync_recv:
+                    _xpu_comm_group_end()
                     allgather_partial(
                         d,
                         nranks=mp_degree,
@@ -549,6 +570,7 @@
             )
             if sync_recv:
+                _xpu_comm_group_end()
                 allgather_partial(
                     tensor_recv_prev,
                     nranks=mp_degree,
@@ -595,6 +617,7 @@
                    )
                    if sync_recv:
+                        _xpu_comm_group_end()
                        allgather_partial(
                            d,
                            nranks=mp_degree,
@@ -615,6 +638,7 @@
                use_calc_stream=sync_recv,
            )
            if sync_recv:
+                _xpu_comm_group_end()
                allgather_partial(
                    tensor_recv_next,
                    nranks=mp_degree,
@@ -624,7 +648,7 @@
            )
        else:
            tasks.append(task)
-
+    _xpu_comm_group_end()
    if not sync_recv:
        if framework.in_dygraph_mode():
            # wait irecv tasks in eager dygraph mode with new comm library
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 928c93df0b40a..4c47cbfcc1d0b 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -323,6 +323,6 @@ def cvt_to_device(x, dev_id, blocking=True):
         place = paddle.XPUPlace(dev_id)
     else:
         raise OSError(
-            "Only supported compiled paddle with gpu/rocm, npu and xpu , but current verison is compiled with cpu."
+            "Only supports paddle compiled with gpu/rocm and xpu, but the current version is compiled with cpu."
diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
index c6acae878745b..a5689020eb009 100644
--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -110,7 +110,10 @@ def forward(
         cur_device = paddle.get_device()
         assert (
-            'gpu:' in paddle.get_device() or 'xpu:' in paddle.get_device()
+            'gpu:' in paddle.get_device()
+            or 'xpu:' in paddle.get_device()
+            or cur_device.split(':')[0]
+            in paddle.device.get_all_custom_device_type()
         ), "Recompute with RNG is not support current device: {}.".format(
             cur_device
         )
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
index 02099c743933e..a2c3e34f6dee5 100644
--- a/python/paddle/distributed/launch/main.py
+++ b/python/paddle/distributed/launch/main.py
@@ -52,7 +52,7 @@ def launch():
         - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.
 
-        - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.
+        - ``--devices``: The selected accelerator devices on nodes, can be gpu/xpu etc., e.g. ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.
 
         - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index 3216c1b408276..6a763ce15030f 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -685,7 +685,7 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"):
         world_process_group.ranks,
     )
 
-    # TODO to support CUDAPinned/NPU/XPU Places
+    # TODO to support CUDAPinned/XPU Places
     if direction == "D2H":
         dst_place_type = 0
     else:
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 605db8e932bf8..e0e102e2393f9 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -55,8 +55,6 @@
 from . import reader
 from . import unique_name
 from .reader import *
-from . import dataloader
-from .dataloader import *
 from .
import core from paddle.utils import deprecated from paddle.fluid.framework import static_only diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 4724077f1c9ec..ed294700b1621 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -40,14 +40,6 @@ _cleanup, _set_SIGCHLD_handler, ) -from .dataloader import BatchSampler, Dataset, IterableDataset, Subset -from .dataloader.dataloader_iter import ( - _DataLoaderIterSingleProcess, - _DataLoaderIterMultiProcess, - _DatasetKind, - default_collate_fn, -) -from .dataloader.batch_sampler import _InfiniteIterableSampler from .layers.io import ( monkey_patch_reader_methods, _copy_reader_var_, @@ -69,22 +61,12 @@ # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process QUEUE_GET_TIMEOUT = 60 -__all__ = ['PyReader', 'DataLoader', 'default_collate_fn'] +__all__ = ['PyReader', 'DataLoader'] data_loader_unique_name_generator = UniqueNameGenerator() KEEP_DATA_LOADER_ORDER = True USE_PINNED_MEMORY = None -# AutoTune Flags -USE_AUTOTUNE = False -TUNING_STEPS = 500 - - -def set_autotune_config(use_autotune, tuning_steps=500): - global USE_AUTOTUNE - USE_AUTOTUNE = use_autotune - global TUNING_STEPS - TUNING_STEPS = tuning_steps def keep_data_loader_order(*args): @@ -171,454 +153,7 @@ def _check_input_array(cls, item): return arr -class AuToTune: - def __init__(self, loader): - self.loader = loader - self.max_num_worker = multiprocessing.cpu_count() / 2 - - def __call__(self): - # use default loader - if (not USE_AUTOTUNE) or (not self.need_autotune()): - return self.loader.num_workers - - # get autotune loader - auto_tune_loader = self.get_autotune_loader() - if auto_tune_loader is None: - return self.loader.num_workers - - # pick the best num_workers - auto_tune_start = time.time() - logging.debug("========= DataLoader Auto Tune =========") - logging.debug( - "User config for DataLoader: " + str(self.loader.num_workers) - ) - best_num_workers = 0 - min_cost = float("inf") - logging.debug( - "Tuning Range for num_workers: 0 ~ " + str(self.max_num_worker) - ) - num_workers = 0 - while num_workers < self.max_num_worker: - auto_tune_loader.num_workers = num_workers - avg_cost = self.evaluate_reader_cost(auto_tune_loader) - if min_cost * 0.75 > avg_cost: - min_cost = avg_cost - best_num_workers = num_workers - else: - update_num = self.is_best( - auto_tune_loader, - best_num_workers, - min_cost, - self.max_num_worker, - ) - if update_num == best_num_workers: - break - else: - best_num_workers = update_num - logging.debug( - "num_workers: " - + str(num_workers) - + " avg_cost: " - + str(avg_cost) - ) - num_workers += 2 - logging.info( - "auto_tune dataLoader best_num_workers: " + str(best_num_workers) - ) - logging.debug( - "AutoTuning Cost for DataLoader: " - + str(time.time() - auto_tune_start) - + ' seconds' - ) - - # tune the default loader's num_workers - return best_num_workers - - def need_autotune(self): - if sys.platform == 'darwin' or sys.platform == 'win32': - return False - else: - return True - - def get_sub_dataset(self, dataset, batch_size): - num_samples = min(batch_size * TUNING_STEPS, len(dataset)) - sub_dataset = Subset(dataset, indices=list(range(num_samples))) - return sub_dataset - - def get_autotune_loader(self): - loader = copy.copy(self.loader) - batch_size = self.loader.batch_sampler.batch_size - if isinstance( - self.loader.batch_sampler, paddle.io.DistributedBatchSampler - ): - dataset = self.loader.batch_sampler.dataset - sub_dataset = 
self.get_sub_dataset(dataset, batch_size) - loader.batch_sampler = paddle.io.DistributedBatchSampler( - dataset=sub_dataset, - batch_size=batch_size, - num_replicas=self.loader.batch_sampler.nranks, - rank=self.loader.batch_sampler.local_rank, - shuffle=self.loader.batch_sampler.shuffle, - drop_last=self.loader.batch_sampler.drop_last, - ) - elif isinstance(self.loader.batch_sampler, paddle.io.BatchSampler): - dataset = self.loader.batch_sampler.sampler.data_source - sub_dataset = self.get_sub_dataset(dataset, batch_size) - loader.batch_sampler = paddle.io.BatchSampler( - dataset=sub_dataset, - batch_size=batch_size, - drop_last=self.loader.batch_sampler.drop_last, - ) - else: - loader = None - return loader - - def evaluate_reader_cost(self, reader): - costs = [] - avg_cost = 0 - start = time.time() - for i, data in enumerate(reader): - costs.append(time.time() - start) - start = time.time() - if len(costs) > 2: - avg_cost = sum(costs[2:]) / len(costs[2:]) - else: - avg_cost = sum(costs[0:]) / len(costs[0:]) - return avg_cost - - def is_best(self, reader, best_workers, best_time, num_work_boundary): - step = 0 - num_workers = best_workers + 1 - boundary = 1 - while num_workers < num_work_boundary and step < 5: - self.loader.num_workers = num_workers - time = self.evaluate_reader_cost(reader) - logging.debug( - "for back num_workers: " - + str(num_workers) - + " avg_cost: " - + str(time) - ) - step += 1 - if time < best_time * 0.70 * boundary: - return num_workers - else: - num_workers += 1 - boundary *= 0.80 - return best_workers - - class DataLoader: - """ - DataLoader prodives an iterator which iterates given dataset - once by the batch_sampler. - - DataLoader supports single-process and multi-prcess data loading, - multi-process workers will be used to load data asynchronously if - :attr:`num_workers` is set as a positive number. - - DataLoader supports map-style dataset and iterable-style dataset. - - For map-style datast(can get a sample from dataset with a given - index), please see :code:`paddle.io.Dataset`. - - For iterable-style datast(get samples from dataset iteratively, - like a Python iterator), please see :code:`paddle.io.IterableDataset`. - - For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` - - .. note:: - GPU tensor operation is not supported in subprocess currently, - please don't use GPU tensor operations in pipeline which will - be performed in subprocess, such as dataset transforms, collte_fn, - etc. Numpy array and CPU tensor operation is supported. - - **Disable automatic batching** - - In certain cases such as some NLP tasks, instead of automatic batching, - handling batching manually in dataset is needed by users. For these - cases, automatic batching is disabled if both :attr:`batch_size` and - :attr:`batch_sampler` is set as None, each data got from :attr:`dataset` - should be batched data and will be processed with function define by - :attr:`collate_fn` or :attr:`default_collate_fn`. - - - .. note:: - When automatic batching is disabled, :attr:`default_collate_fn` will - do nothing to data from dataset. - - - Args: - dataset(Dataset): the dataset to load data from, should be an - instance of subclass of :code:`paddle.io.Dataset` or - :code:`paddle.io.IterableDataset`. - feed_list (list(Tensor)|tuple(Tensor), optional): feed Tensor list. - The Tensors should be created by :code:`paddle.static.data()`. - :attr:`feed_list` must be set if :attr:`return_list` is - False. Default None. 
- places(list(Place)|tuple(Place)|list(str), optional): a list of Place, - to put data onto, :attr:`places` can be None, if - :attr:`places` is None, default place(CPUPlace or CUDAPlace(0)) - will be used. Default None. If ``places`` is list of string, - the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, - where ``x`` is the index of the GPUs. - return_list (bool, optional): whether the return value on each device is - presented as a list. If :attr:`return_list=False`, the return - value on each device would be a dict of str -> Tensor, where - the key of the dict is the name of each fed Tensors. If - :attr:`return_list=True`, the return value on each device would - be a list(Tensor). :attr:`return_list` can only be True - in dynamic graph mode. Default True. - batch_sampler(BatchSampler, optional): an instance of `paddle.io.BatchSampler` - to generate batch indices to draw samples from :attr:`dataset` - and combine a batch. Default None. - batch_size(int|None, optional): sample number in a mini-batch, a substitution - parameter for :attr:`batch_sampler`, if :attr:`batch_sampler` - is not set, a default `paddle.io.BatchSampler` will be used - and initialize by :attr:`batch_size`, :attr:`shuffle` and - :attr:`drop_last`. Default 1. - shuffle(bool, optional): whther to shuffle indices order before genrate - batch indices, a substitution parameter for :attr:`batch_sampler` - see :attr:`batch_size`. Default False. - drop_last(bool, optional): whether drop the last incomplete batch dataset size - is not divisible by the batch size, a substitution parameter - for :attr:`batch_sampler`, see :attr:`batch_size`. Default False - collate_fn(callable, optional): function to generate mini-batch data by merging - the sample list, None for only stack each fields of sample in axis - 0(same as :attr::`np.stack(..., axis=0)`). Default None - num_workers(int, optional): the number of subprocess to load data, 0 for no - subprocess used and loading data in main process. Default 0 - use_buffer_reader (bool, optional): whether to use bufferred reader. - If use_buffer_reader=True, the DataLoader would prefetch - batch data asynchronously, so it would speed up data feeding - and occupies a little more CPU or GPU memory, i.e., the memory - of one batch input data. Default True. - prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch - if use_buffer_reader=True. Default 2. - use_shared_memory (bool, optional): whether to use shared memory to speed up - putting data into inter-process queue, set :attr:`use_shared_memory` - as True only when the shared memory space on your machine(e.g. - space of '/dev/shm' on Linux operating sysytem) is large enough. - Shared memory will only be enabled in multi-process mode(num_workers - > 0). Default True. - timeout(int, optional): the timeout value for getting data form output queue - of subprocesses. Default 0. - worker_init_fn(callable, optional): init function which will be called with - worker id on each subproces starting if not set as None. Default - None. - - Returns: - DataLoader: an iterable object for data iterating, each elemnet of the generated data is a Tensor. - - Examples: - - .. 
code-block:: python - - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.nn.functional as F - from paddle.io import Dataset, BatchSampler, DataLoader - - BATCH_NUM = 20 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - def forward(self, image, label=None): - return self.fc(image) - - simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = F.cross_entropy(out, label) - avg_loss = paddle.mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - - .. note:: - For reading iterable dataset with multiprocess Dataloader, - please see :code:`paddle.io.IterableDataset` - - """ - - def __init__( - self, - dataset, - feed_list=None, - places=None, - return_list=True, - batch_sampler=None, - batch_size=1, - shuffle=False, - drop_last=False, - collate_fn=None, - num_workers=0, - use_buffer_reader=True, - prefetch_factor=2, - use_shared_memory=True, - timeout=0, - worker_init_fn=None, - persistent_workers=False, - ): - self.return_list = return_list - self.collate_fn = collate_fn - self.use_buffer_reader = use_buffer_reader - self.prefetch_factor = prefetch_factor - self.worker_init_fn = worker_init_fn - - self.dataset = dataset - - if not return_list and not _non_static_mode(): - assert ( - feed_list is not None - ), "feed_list should be set when return_list=False" - self.feed_list = feed_list - - if places is None: - places = _current_expected_place() - if isinstance(places, (list, tuple)): - places = _get_paddle_place_list(places) - else: - places = _get_paddle_place(places) - self.places = _convert_places(places) - - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 0 and ( - sys.platform == 'darwin' or sys.platform == 'win32' - ): - warnings.warn( - "DataLoader with multi-process mode is not supported on MacOs and Windows currently." 
- " Please use signle-process mode with num_workers = 0 instead" - ) - num_workers = 0 - self.num_workers = num_workers - - assert prefetch_factor > 0, "prefetch_factor should be a positive value" - - self.use_shared_memory = use_shared_memory - if use_shared_memory and num_workers == 0: - self.use_shared_memory = False - - assert timeout >= 0, "timeout should be a non-negative value" - self.timeout = timeout - - if isinstance(dataset, IterableDataset): - self.dataset_kind = _DatasetKind.ITER - if shuffle: - raise ValueError( - "IterableDataset not support shuffle, but got shuffle={}".format( - shuffle - ) - ) - if batch_sampler is not None: - raise ValueError( - "IterableDataset expect unspecified batch_sampler" - ) - else: - self.dataset_kind = _DatasetKind.MAP - - if batch_sampler is not None: - assert batch_size == 1 and not shuffle and not drop_last, ( - "batch_size/shuffle/drop_last should not be set when " - "batch_sampler is given" - ) - self.batch_sampler = batch_sampler - self.batch_size = None - elif batch_size is None: - self.batch_sampler = None - self.batch_size = None - else: - assert batch_size > 0, ( - "batch_size should be None or a positive value when " - "batch_sampler is not given" - ) - self.batch_size = batch_size - if isinstance(dataset, IterableDataset): - self.batch_sampler = _InfiniteIterableSampler( - dataset, batch_size - ) - else: - self.batch_sampler = BatchSampler( - dataset=dataset, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last, - ) - - self.drop_last = drop_last - self.auto_collate_batch = self.batch_sampler is not None - - self.pin_memory = False - if _non_static_mode(): - self.pin_memory = ( - True if use_pinned_memory() is None else use_pinned_memory() - ) - - self._persistent_workers = persistent_workers - self._iterator = None - self.num_workers = AuToTune(self).__call__() - - def __len__(self): - if self.dataset_kind == _DatasetKind.ITER: - raise ValueError("length of IterableDataset not supported") - else: - if self.auto_collate_batch: - return len(self.batch_sampler) - else: - return len(self.dataset) - - def __iter__(self): - if self.num_workers == 0: - return _DataLoaderIterSingleProcess(self) - elif self._persistent_workers: - if self._iterator is None: - self._iterator = _DataLoaderIterMultiProcess(self) - else: - self._iterator._reset() - return self._iterator - else: - return _DataLoaderIterMultiProcess(self) - - def __call__(self): - return self.__iter__() - @staticmethod def from_generator( feed_list=None, @@ -793,7 +328,7 @@ def set_data_source(loader, places): label = static.data(name='label', shape=[None, 1], dtype='int64') # Define DataLoader - loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) + loader = paddle.fluid.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) # Define network loss = simple_net(image, label) @@ -867,7 +402,7 @@ def forward(self, x): adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) # create data loader - loader = paddle.io.DataLoader.from_generator(capacity=5) + loader = paddle.fluid.io.DataLoader.from_generator(capacity=5) loader.set_batch_generator(random_batch_reader()) for epoch_id in range(EPOCH_NUM): @@ -944,7 +479,7 @@ def from_dataset(dataset, places, drop_last=True): use_var=[image, label]) dataset.set_filelist(['a.txt', 'b.txt', 'c.txt']) - loader = paddle.io.DataLoader.from_dataset(dataset, static.cpu_places()) + loader = paddle.fluid.io.DataLoader.from_dataset(dataset, 
static.cpu_places()) """ return DatasetLoader(dataset, places, drop_last) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index c74e2b7adaa22..a2a9c9113271b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -37,8 +37,8 @@ ) from paddle.distributed.sharding.group_sharded import group_sharded_parallel from paddle.distributed.utils.log_utils import get_logger -from paddle.fluid.dataloader.dataset import IterableDataset from paddle.incubate.distributed.utils.io import save_for_auto_inference +from paddle.io import IterableDataset from paddle.nn import Linear logger = get_logger("INFO", __file__) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index d7e09481a1c71..9e2b89b12860c 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -406,7 +406,7 @@ def setUp(self): ] def test_main(self): - from paddle.fluid.dataloader.worker import _generate_states + from paddle.io.dataloader.worker import _generate_states for inp, outp in zip(self.inputs, self.outputs): out = _generate_states(*inp) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 2b4d1a78d1ea3..bfd08f703c4f6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -19,8 +19,8 @@ from paddle import fluid from paddle.fluid import core -from paddle.fluid.dataloader.dataloader_iter import _worker_loop from paddle.io import BatchSampler, DataLoader, Dataset, IterableDataset +from paddle.io.dataloader.worker import _worker_loop class RandomDataset(Dataset): diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index 742c5bded064a..dfad1dc58c928 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -84,7 +84,7 @@ def set_config(config=None): if config is None: core.enable_autotune() core.enable_layout_autotune() - paddle.fluid.reader.set_autotune_config(use_autotune=True) + paddle.io.reader.set_autotune_config(use_autotune=True) return config_dict = {} @@ -147,7 +147,7 @@ def set_config(config=None): ) if "tuning_steps" in dataloader_config: if isinstance(dataloader_config['tuning_steps'], int): - paddle.fluid.reader.set_autotune_config( + paddle.io.reader.set_autotune_config( use_autoune, dataloader_config['tuning_steps'] ) else: @@ -155,4 +155,4 @@ def set_config(config=None): "The auto-tuning configuration of the dataloader is incorrect." "The `tuning_steps` should be int. Use default parameter instead." 
) - paddle.fluid.reader.set_autotune_config(use_autoune) + paddle.io.reader.set_autotune_config(use_autoune) diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index a9c0e9a2f2d2f..6c2e0dae67834 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -14,21 +14,21 @@ # TODO: define all functions about input & output in this directory -from ..fluid.io import DataLoader # noqa: F401 -from ..fluid.dataloader import Dataset # noqa: F401 -from ..fluid.dataloader import IterableDataset # noqa: F401 -from ..fluid.dataloader import BatchSampler # noqa: F401 -from ..fluid.dataloader import get_worker_info # noqa: F401 -from ..fluid.dataloader import TensorDataset # noqa: F401 -from ..fluid.dataloader import Sampler # noqa: F401 -from ..fluid.dataloader import SequenceSampler # noqa: F401 -from ..fluid.dataloader import RandomSampler # noqa: F401 -from ..fluid.dataloader import DistributedBatchSampler # noqa: F401 -from ..fluid.dataloader import ComposeDataset # noqa: F401 -from ..fluid.dataloader import ChainDataset # noqa: F401 -from ..fluid.dataloader import WeightedRandomSampler # noqa: F401 -from ..fluid.dataloader import Subset # noqa: F401 -from ..fluid.dataloader import random_split # noqa: F401 +from .reader import DataLoader # noqa: F401 +from .dataloader import Dataset # noqa: F401 +from .dataloader import IterableDataset # noqa: F401 +from .dataloader import BatchSampler # noqa: F401 +from .dataloader import get_worker_info # noqa: F401 +from .dataloader import TensorDataset # noqa: F401 +from .dataloader import Sampler # noqa: F401 +from .dataloader import SequenceSampler # noqa: F401 +from .dataloader import RandomSampler # noqa: F401 +from .dataloader import DistributedBatchSampler # noqa: F401 +from .dataloader import ComposeDataset # noqa: F401 +from .dataloader import ChainDataset # noqa: F401 +from .dataloader import WeightedRandomSampler # noqa: F401 +from .dataloader import Subset # noqa: F401 +from .dataloader import random_split # noqa: F401 __all__ = [ # noqa 'Dataset', diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/io/dataloader/__init__.py similarity index 55% rename from python/paddle/fluid/dataloader/__init__.py rename to python/paddle/io/dataloader/__init__.py index c0b2052283b1c..bb65463f70afc 100644 --- a/python/paddle/fluid/dataloader/__init__.py +++ b/python/paddle/io/dataloader/__init__.py @@ -12,21 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import dataset -from .dataset import * +from .dataset import Dataset +from .dataset import IterableDataset +from .dataset import TensorDataset +from .dataset import ComposeDataset +from .dataset import ChainDataset +from .dataset import random_split +from .dataset import Subset -from . import batch_sampler -from .batch_sampler import * +from .batch_sampler import BatchSampler +from .batch_sampler import DistributedBatchSampler -from . import dataloader_iter -from .dataloader_iter import * +from .worker import get_worker_info -from . 
import sampler -from .sampler import * - -__all__ = ( - dataset.__all__ - + batch_sampler.__all__ - + dataloader_iter.__all__ - + sampler.__all__ -) +from .sampler import Sampler +from .sampler import SequenceSampler +from .sampler import RandomSampler +from .sampler import WeightedRandomSampler diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py similarity index 98% rename from python/paddle/fluid/dataloader/batch_sampler.py rename to python/paddle/io/dataloader/batch_sampler.py index 3e0449719c4cd..190e9240900f8 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import math -from .sampler import Sampler, SequenceSampler, RandomSampler -from .dataset import Dataset, IterableDataset +import numpy as np -__all__ = ["BatchSampler", "DistributedBatchSampler"] +from .dataset import IterableDataset +from .sampler import RandomSampler, Sampler, SequenceSampler class BatchSampler(Sampler): diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/io/dataloader/collate.py similarity index 97% rename from python/paddle/fluid/dataloader/collate.py rename to python/paddle/io/dataloader/collate.py index dd70a3421409d..141624668f09b 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/io/dataloader/collate.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle import numbers +from collections.abc import Mapping, Sequence + import numpy as np -from ..framework import _non_static_mode -from .. import core, layers -from collections.abc import Sequence, Mapping +import paddle + +from ...framework import core def default_collate_fn(batch): diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py similarity index 98% rename from python/paddle/fluid/dataloader/dataloader_iter.py rename to python/paddle/io/dataloader/dataloader_iter.py index 2b06c371ef36f..43b749c869dd6 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -12,51 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools +import logging import os +import queue import sys -import time -import signal -import numbers -import logging -import itertools import threading +import time import warnings -import numpy as np -from collections import namedtuple -from paddle.fluid.framework import ( - _set_expected_place, - _current_expected_place, - set_flags, -) -import queue +import numpy as np import paddle -import paddle.profiler as profiler +from paddle import profiler +from paddle.fluid.framework import _current_expected_place, _set_expected_place +from paddle.profiler.timer import benchmark from paddle.profiler.utils import in_profiler_mode -from .. 
import core, layers -from ..framework import in_dygraph_mode + +from ...framework import core, in_dygraph_mode from ..multiprocess_utils import ( - _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar, + _set_SIGCHLD_handler, ) -from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher from .batch_sampler import _InfiniteIterableSampler from .collate import default_collate_fn, default_convert_fn +from .flat import _flatten_batch, _restore_batch from .worker import ( - ParentWatchDog, - get_worker_info, - _worker_loop, _DatasetKind, _IterableDatasetStopIteration, - _WorkerException, _ResumeIteration, + _worker_loop, + _WorkerException, ) -from .flat import _flatten_batch, _restore_batch -from paddle.profiler.timer import benchmark - -__all__ = ['get_worker_info'] # NOTE: fix `terminate called without an active exception` # if for loop break and program exit immediately(with no model @@ -95,7 +83,7 @@ class _DataLoaderIterBase: data by setting in given dataloader. Args: - loader(instance of DataLoader): instance of `fluid.io.DataLoader` + loader(instance of DataLoader): instance of `paddle.io.DataLoader` """ def __init__(self, loader): @@ -439,7 +427,7 @@ def __init__(self, loader): self._shutdown = False def _init_workers(self): - import paddle.incubate.multiprocessing as multiprocessing + from paddle.incubate import multiprocessing # multiprocess worker and indice queue list initial as empty self._workers = [] diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py similarity index 98% rename from python/paddle/fluid/dataloader/dataset.py rename to python/paddle/io/dataloader/dataset.py index 3701da0b33ec7..e8bb6bbd364c8 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -13,17 +13,8 @@ # limitations under the License. import paddle -from .. import framework - -__all__ = [ - "Dataset", - "IterableDataset", - "TensorDataset", - "ComposeDataset", - "ChainDataset", - "random_split", - "Subset", -] + +from ... import framework class Dataset: diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/io/dataloader/fetcher.py similarity index 60% rename from python/paddle/fluid/dataloader/fetcher.py rename to python/paddle/io/dataloader/fetcher.py index b097a315c0c73..309d009cfc106 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/io/dataloader/fetcher.py @@ -12,12 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging -from ..log_helper import get_logger -from collections.abc import Sequence, Mapping - -_WARNING_TO_LOG = True - class _DatasetFetcher: def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): @@ -37,47 +31,8 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): # ecah sample processing in the batch def fetch(self, batch_indices, done_event=None): raise NotImplementedError( - "'fetch' not implement for class {}".format(self.__class__.__name__) - ) - - def _log_warning(self): - # only log warning on GPU 0 when distributed launch - from ...distributed import get_world_size, get_rank - - if get_world_size() >= 2 and get_rank() != 0: - return - - warn_str = ( - "Detect dataset only contains single fileds, return format " - "changed since Paddle 2.1. In Paddle <= 2.0, DataLoader add " - "a list surround output data(e.g. return [data]), and in " - "Paddle >= 2.1, DataLoader return the single filed directly " - "(e.g. return data). 
For example, in following code: \n\n" - ) - warn_str += ( - "import numpy as np\n" - "from paddle.io import DataLoader, Dataset\n\n" - "class RandomDataset(Dataset):\n" - " def __getitem__(self, idx):\n" - " data = np.random.random((2, 3)).astype('float32')\n\n" - " return data\n\n" - " def __len__(self):\n" - " return 10\n\n" - "dataset = RandomDataset()\n" - "loader = DataLoader(dataset, batch_size=1)\n" - "data = next(loader())\n\n" - ) - - warn_str += ( - "In Paddle <= 2.0, data is in format '[Tensor(shape=(1, 2, 3), " - "dtype=float32)]', and in Paddle >= 2.1, data is in format" - " 'Tensor(shape=(1, 2, 3), dtype=float32)'\n" - ) - - logger = get_logger( - "DataLoader", logging.INFO, fmt='%(levelname)s: %(message)s' + f"'fetch' not implement for class {self.__class__.__name__}" ) - logger.warning(warn_str) class _IterableDatasetFetcher(_DatasetFetcher): @@ -103,10 +58,6 @@ def fetch(self, batch_indices, done_event=None): ): raise StopIteration - global _WARNING_TO_LOG - if not isinstance(data[0], (Sequence, Mapping)) and _WARNING_TO_LOG: - self._log_warning() - _WARNING_TO_LOG = False else: data = next(self.dataset_iter) @@ -128,10 +79,6 @@ def fetch(self, batch_indices, done_event=None): else: return None - global _WARNING_TO_LOG - if not isinstance(data[0], (Sequence, Mapping)) and _WARNING_TO_LOG: - self._log_warning() - _WARNING_TO_LOG = False else: data = self.dataset[batch_indices] diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/io/dataloader/flat.py similarity index 93% rename from python/paddle/fluid/dataloader/flat.py rename to python/paddle/io/dataloader/flat.py index 1e1ed1eebd806..f674d7fb2b4b9 100644 --- a/python/paddle/fluid/dataloader/flat.py +++ b/python/paddle/io/dataloader/flat.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle import numbers -import numpy as np +from collections.abc import Mapping, Sequence -from collections.abc import Sequence, Mapping +import numpy as np +import paddle FIELD_PREFIX = "_paddle_field_" @@ -38,7 +38,7 @@ def _flatten(batch, flat_batch, structure, field_idx): field, (np.ndarray, paddle.Tensor, paddle.fluid.core.eager.Tensor), ): - structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) + structure.append(f'{FIELD_PREFIX}{field_idx}') flat_batch.append(field) field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): @@ -61,7 +61,7 @@ def _flatten(batch, flat_batch, structure, field_idx): field, (np.ndarray, paddle.Tensor, paddle.fluid.core.eager.Tensor), ): - structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) + structure[k] = f'{FIELD_PREFIX}{field_idx}' flat_batch.append(field) field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): @@ -79,7 +79,7 @@ def _flatten(batch, flat_batch, structure, field_idx): else: structure[k] = field else: - raise TypeError("wrong flat data type: {}".format(type(batch))) + raise TypeError(f"wrong flat data type: {type(batch)}") return structure, field_idx @@ -130,7 +130,7 @@ def _restore(structure, field_idx): elif isinstance(field, (Sequence, Mapping)): field_idx = _restore(structure[k], field_idx) else: - raise TypeError("wrong flat data type: {}".format(type(structure))) + raise TypeError(f"wrong flat data type: {type(structure)}") return field_idx @@ -145,7 +145,7 @@ def _restore(structure, field_idx): if isinstance(structure, (str, bytes)): assert structure == '{}{}'.format( FIELD_PREFIX, 0 - ), "invalid structure: {}".format(structure) + ), f"invalid structure: {structure}" return flat_batch[0] field_idx = _restore(structure, 0) assert field_idx + 1 == len(flat_batch), "Tensor parse incomplete" diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py similarity index 98% rename from python/paddle/fluid/dataloader/sampler.py rename to python/paddle/io/dataloader/sampler.py index a6ec3ffbae9b8..aa8a4e649c76c 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/io/dataloader/sampler.py @@ -13,14 +13,8 @@ # limitations under the License. import numpy as np -from .. import core -__all__ = [ - "Sampler", - "SequenceSampler", - "RandomSampler", - "WeightedRandomSampler", -] +from ...framework import core class Sampler: @@ -317,7 +311,7 @@ def __iter__(self): idxs = _weighted_sample( self.weights, self.num_samples, self.replacement ) - return iter(idxs.reshape((-1)).tolist()) + return iter(idxs.reshape(-1).tolist()) def __len__(self): mul = np.prod(self.weights.shape) // self.weights.shape[-1] diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/io/dataloader/worker.py similarity index 98% rename from python/paddle/fluid/dataloader/worker.py rename to python/paddle/io/dataloader/worker.py index de6c382054e0a..4ca80e09ae65e 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -13,25 +13,25 @@ # limitations under the License. import os + +# NOTE: queue has a different name in python2 and python3 +import queue import sys -import paddle -import numpy as np import traceback -from collections import namedtuple -from .. 
import core
-from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
+
+import numpy as np
+
+import paddle
+
+from ...framework import core
 from ..multiprocess_utils import (
-    _cleanup_mmap,
-    CleanupFuncRegistrar,
     MP_STATUS_CHECK_INTERVAL,
+    CleanupFuncRegistrar,
+    _cleanup_mmap,
 )
-from ..framework import _non_static_mode, _in_eager_without_dygraph_check
+from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
 from .flat import _flatten_batch
 
-import queue
-
-__all__ = ['get_worker_info']
-
 
 class _IterableDatasetStopIteration:
     def __init__(self, worker_id):
@@ -59,7 +59,7 @@ def create_fetcher(
             dataset, auto_collate_batch, collate_fn, drop_last
         )
     else:
-        raise NotImplementedError("unknown Dataset kind {}".format(kind))
+        raise NotImplementedError(f"unknown Dataset kind {kind}")
 
 
 class ParentWatchDog:
@@ -291,9 +291,9 @@ def _worker_loop(
 
     # set different numpy seed for each worker
     try:
-        import numpy as np
-        import time
         import random
+
+        import numpy as np
    except ImportError:
        pass
    else:
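The final worker.py hunk above keeps the per-worker numpy re-seeding inside _worker_loop. Users who need reproducible per-worker randomness on top of that can pass worker_init_fn, which the moved DataLoader forwards to every subprocess; a small sketch (the seed formula is illustrative, and multi-process mode assumes Linux):

    import numpy as np
    from paddle.io import DataLoader, Dataset

    class RandomDataset(Dataset):
        def __getitem__(self, idx):
            return np.random.random([4]).astype("float32")

        def __len__(self):
            return 32

    def seed_worker(worker_id):
        # Illustrative only: any formula giving distinct per-worker seeds works.
        np.random.seed(1234 + worker_id)

    loader = DataLoader(
        RandomDataset(), batch_size=8, num_workers=2, worker_init_fn=seed_worker
    )
    for batch in loader():
        pass
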
diff --git a/python/paddle/io/multiprocess_utils.py b/python/paddle/io/multiprocess_utils.py
new file mode 100644
index 0000000000000..5792983ceb475
--- /dev/null
+++ b/python/paddle/io/multiprocess_utils.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import atexit
+
+# NOTE: queue has a different name in python2 and python3
+import queue
+import signal
+import sys
+
+from ..framework import core
+
+# multi-process worker check indices queue interval, avoid
+# hanging in subprocess data loading
+MP_STATUS_CHECK_INTERVAL = 5.0
+
+# NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading,
+# the data in the queue needs to be popped. Then the LoDTensor read by the main process
+# from the child process will automatically clear the memory-mapped file.
+multiprocess_queue_set = set()
+
+
+def _clear_multiprocess_queue_set():
+    global multiprocess_queue_set
+    for data_queue in multiprocess_queue_set:
+        while True:
+            try:
+                data_queue.get_nowait()
+            except queue.Empty:
+                break
+
+
+# NOTE: main process clear function at exit
+def _cleanup():
+    # NOTE: inter-process Queue shared memory objects clear function
+    _clear_multiprocess_queue_set()
+    # NOTE: main process memory map files clear function
+    core._cleanup_mmap_fds()
+
+
+# NOTE: for child process clear function at exit
+def _cleanup_mmap():
+    # clear memory map files in child process
+    core._cleanup_mmap_fds()
+
+
+# NOTE: used to register a function to be executed at interpreter exit.
+class CleanupFuncRegistrar:
+    # Record the cleanup functions that have been executed
+    _executed_func_set = set()
+    # Record the cleanup functions that have been registered
+    _registered_func_set = set()
+
+    @classmethod
+    def register(cls, function, signals=[]):
+        def _func_executor():
+            if function not in cls._executed_func_set:
+                try:
+                    function()
+                finally:
+                    cls._executed_func_set.add(function)
+
+        def _func_register(function):
+            if not callable(function):
+                raise TypeError("%s is not a callable object." % (function))
+            # check whether the function object is hashable
+            if function not in cls._registered_func_set:
+                atexit.register(_func_executor)
+                cls._registered_func_set.add(function)
+
+        def _signal_handler(signum=None, frame=None):
+            _func_executor()
+            if signum is not None:
+                if signum == signal.SIGINT:
+                    raise KeyboardInterrupt
+                sys.exit(signum)
+
+        def _signal_register(signals):
+            signals = set(signals)
+            for sig in signals:
+                orig_handler = signal.signal(sig, _signal_handler)
+                if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
+                    if (
+                        sig == signal.SIGINT
+                        and orig_handler is signal.default_int_handler
+                    ):
+                        continue
+                    if orig_handler not in cls._registered_func_set:
+                        atexit.register(orig_handler)
+                        cls._registered_func_set.add(orig_handler)
+
+        # deal with signals
+        _signal_register(signals)
+        # deal with function
+        _func_register(function)
+
+
+# NOTE: [ mmap files clear ] When the main process exits unexpectedly, the remaining
+# shared memory objects in the inter-process Queue and the main process (mostly in the
+# BlockingQueue) may not be completely released, resulting in the corresponding
+# memory-mapped file remaining on the disk (/dev/shm), so register this function
+# to clean up shared memory objects in these two queues before the python interpreter exits.
+# NOTE: Currently multi-process DataLoader only supports Linux platform
+if not (sys.platform == 'darwin' or sys.platform == 'win32'):
+    CleanupFuncRegistrar.register(_cleanup)
+
+# ------------ SIGCHLD handler setting --------------
+_SIGCHLD_handler_set = False
+
+
+def _set_SIGCHLD_handler():
+    global _SIGCHLD_handler_set
+    if _SIGCHLD_handler_set:
+        return
+
+    current_handler = signal.getsignal(signal.SIGCHLD)
+    if not callable(current_handler):
+        current_handler = None
+
+    def __handler__(signum, frame):
+        # NOTE: Here the signum is SIGCHLD, when the child process exits,
+        # this handler will be called whenever the child process exits
+        # normally or abnormally.
+        core._throw_error_if_process_failed()
+        if current_handler is not None:
+            current_handler(signum, frame)
+
+    signal.signal(signal.SIGCHLD, __handler__)
+    _SIGCHLD_handler_set = True
diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py
new file mode 100644
index 0000000000000..6698caa435f94
--- /dev/null
+++ b/python/paddle/io/reader.py
@@ -0,0 +1,528 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import copy +import multiprocessing + +# NOTE: queue has a different name in python2 and python3 +import sys +import time +import warnings + +import paddle +from paddle.fluid.framework import logging + +from ..fluid.framework import ( + _current_expected_place, + _get_paddle_place, + _get_paddle_place_list, + _non_static_mode, +) +from ..framework import core +from .dataloader import BatchSampler, IterableDataset, Subset +from .dataloader.batch_sampler import _InfiniteIterableSampler +from .dataloader.dataloader_iter import ( + _DataLoaderIterMultiProcess, + _DataLoaderIterSingleProcess, + _DatasetKind, +) + +# NOTE: [ avoid hanging & failed quickly ] +# These value is used in getting data from another process +QUEUE_GET_TIMEOUT = 60 + +USE_PINNED_MEMORY = None +# AutoTune Flags +USE_AUTOTUNE = False +TUNING_STEPS = 500 + + +def set_autotune_config(use_autotune, tuning_steps=500): + global USE_AUTOTUNE + USE_AUTOTUNE = use_autotune + global TUNING_STEPS + TUNING_STEPS = tuning_steps + + +def use_pinned_memory(*args): + global USE_PINNED_MEMORY + if len(args) == 0: + return USE_PINNED_MEMORY + else: + assert len(args) == 1 and isinstance(args[0], bool) + USE_PINNED_MEMORY = args[0] + + +def _convert_places(places): + if not isinstance(places, (list, tuple)): + places = [places] + + ret = [] + for p in places: + if not isinstance(p, core.Place): + tmp = core.Place() + tmp.set_place(p) + p = tmp + + ret.append(p) + return ret + + +class AuToTune: + def __init__(self, loader): + self.loader = loader + self.max_num_worker = multiprocessing.cpu_count() / 2 + + def __call__(self): + # use default loader + if (not USE_AUTOTUNE) or (not self.need_autotune()): + return self.loader.num_workers + + # get autotune loader + auto_tune_loader = self.get_autotune_loader() + if auto_tune_loader is None: + return self.loader.num_workers + + # pick the best num_workers + auto_tune_start = time.time() + logging.debug("========= DataLoader Auto Tune =========") + logging.debug( + "User config for DataLoader: " + str(self.loader.num_workers) + ) + best_num_workers = 0 + min_cost = float("inf") + logging.debug( + "Tuning Range for num_workers: 0 ~ " + str(self.max_num_worker) + ) + num_workers = 0 + while num_workers < self.max_num_worker: + auto_tune_loader.num_workers = num_workers + avg_cost = self.evaluate_reader_cost(auto_tune_loader) + if min_cost * 0.75 > avg_cost: + min_cost = avg_cost + best_num_workers = num_workers + else: + update_num = self.is_best( + auto_tune_loader, + best_num_workers, + min_cost, + self.max_num_worker, + ) + if update_num == best_num_workers: + break + else: + best_num_workers = update_num + logging.debug( + "num_workers: " + + str(num_workers) + + " avg_cost: " + + str(avg_cost) + ) + num_workers += 2 + logging.info( + "auto_tune dataLoader best_num_workers: " + str(best_num_workers) + ) + logging.debug( + "AutoTuning Cost for DataLoader: " + + str(time.time() - auto_tune_start) + + ' seconds' + ) + + # tune the default loader's num_workers + return best_num_workers + + def need_autotune(self): + if sys.platform == 'darwin' or sys.platform == 'win32': + return False + else: + return True + + def get_sub_dataset(self, dataset, batch_size): + num_samples = min(batch_size * TUNING_STEPS, len(dataset)) + sub_dataset = Subset(dataset, indices=list(range(num_samples))) + return sub_dataset + + def get_autotune_loader(self): + loader = copy.copy(self.loader) + batch_size = self.loader.batch_sampler.batch_size + if isinstance( + self.loader.batch_sampler, 
paddle.io.DistributedBatchSampler
+        ):
+            dataset = self.loader.batch_sampler.dataset
+            sub_dataset = self.get_sub_dataset(dataset, batch_size)
+            loader.batch_sampler = paddle.io.DistributedBatchSampler(
+                dataset=sub_dataset,
+                batch_size=batch_size,
+                num_replicas=self.loader.batch_sampler.nranks,
+                rank=self.loader.batch_sampler.local_rank,
+                shuffle=self.loader.batch_sampler.shuffle,
+                drop_last=self.loader.batch_sampler.drop_last,
+            )
+        elif isinstance(self.loader.batch_sampler, paddle.io.BatchSampler):
+            dataset = self.loader.batch_sampler.sampler.data_source
+            sub_dataset = self.get_sub_dataset(dataset, batch_size)
+            loader.batch_sampler = paddle.io.BatchSampler(
+                dataset=sub_dataset,
+                batch_size=batch_size,
+                drop_last=self.loader.batch_sampler.drop_last,
+            )
+        else:
+            loader = None
+        return loader
+
+    def evaluate_reader_cost(self, reader):
+        costs = []
+        avg_cost = 0
+        start = time.time()
+        for i, data in enumerate(reader):
+            costs.append(time.time() - start)
+            start = time.time()
+        if len(costs) > 2:
+            avg_cost = sum(costs[2:]) / len(costs[2:])
+        else:
+            avg_cost = sum(costs[0:]) / len(costs[0:])
+        return avg_cost
+
+    def is_best(self, reader, best_workers, best_time, num_work_boundary):
+        step = 0
+        num_workers = best_workers + 1
+        boundary = 1
+        while num_workers < num_work_boundary and step < 5:
+            self.loader.num_workers = num_workers
+            time = self.evaluate_reader_cost(reader)
+            logging.debug(
+                "for back num_workers: "
+                + str(num_workers)
+                + " avg_cost: "
+                + str(time)
+            )
+            step += 1
+            if time < best_time * 0.70 * boundary:
+                return num_workers
+            else:
+                num_workers += 1
+                boundary *= 0.80
+        return best_workers
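AuToTune above is gated by the module-level USE_AUTOTUNE/TUNING_STEPS flags, which the paddle.incubate.autotune.set_config hunk earlier in this diff now toggles through paddle.io.reader.set_autotune_config. A sketch of the user-facing switch (the "tuning_steps" key appears in that handler; the "enable" key is inferred from it and may differ):

    import paddle

    # Sketch: turn on DataLoader num_workers autotuning for the next runs.
    # 500 steps is just the default shown in set_autotune_config above.
    paddle.incubate.autotune.set_config(
        {"dataloader": {"enable": True, "tuning_steps": 500}}
    )
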
+
+
+class DataLoader:
+    """
+    DataLoader provides an iterator which iterates given dataset
+    once by the batch_sampler.
+
+    DataLoader supports single-process and multi-process data loading,
+    multi-process workers will be used to load data asynchronously if
+    :attr:`num_workers` is set as a positive number.
+
+    DataLoader supports map-style dataset and iterable-style dataset.
+
+    For map-style dataset(can get a sample from dataset with a given
+    index), please see :code:`paddle.io.Dataset`.
+
+    For iterable-style dataset(get samples from dataset iteratively,
+    like a Python iterator), please see :code:`paddle.io.IterableDataset`.
+
+    For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
+
+    .. note::
+        GPU tensor operation is not supported in subprocess currently,
+        please don't use GPU tensor operations in pipeline which will
+        be performed in subprocess, such as dataset transforms, collate_fn,
+        etc. Numpy array and CPU tensor operation is supported.
+
+    **Disable automatic batching**
+
+    In certain cases such as some NLP tasks, instead of automatic batching,
+    handling batching manually in dataset is needed by users. For these
+    cases, automatic batching is disabled if both :attr:`batch_size` and
+    :attr:`batch_sampler` is set as None, each data got from :attr:`dataset`
+    should be batched data and will be processed with function defined by
+    :attr:`collate_fn` or :attr:`default_collate_fn`.
+
+
+    .. note::
+        When automatic batching is disabled, :attr:`default_collate_fn` will
+        do nothing to data from dataset.
+
+
+    Args:
+        dataset(Dataset): the dataset to load data from, should be an
+            instance of subclass of :code:`paddle.io.Dataset` or
+            :code:`paddle.io.IterableDataset`.
+        feed_list (list(Tensor)|tuple(Tensor), optional): feed Tensor list.
+            The Tensors should be created by :code:`paddle.static.data()`.
+            :attr:`feed_list` must be set if :attr:`return_list` is
+            False. Default None.
+        places(list(Place)|tuple(Place)|list(str), optional): a list of Place,
+            to put data onto, :attr:`places` can be None, if
+            :attr:`places` is None, default place(CPUPlace or CUDAPlace(0))
+            will be used. Default None. If ``places`` is a list of strings,
+            the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``,
+            where ``x`` is the index of the GPUs.
+        return_list (bool, optional): whether the return value on each device is
+            presented as a list. If :attr:`return_list=False`, the return
+            value on each device would be a dict of str -> Tensor, where
+            the key of the dict is the name of each fed Tensors. If
+            :attr:`return_list=True`, the return value on each device would
+            be a list(Tensor). :attr:`return_list` can only be True
+            in dynamic graph mode. Default True.
+        batch_sampler(BatchSampler, optional): an instance of `paddle.io.BatchSampler`
+            to generate batch indices to draw samples from :attr:`dataset`
+            and combine a batch. Default None.
+        batch_size(int|None, optional): sample number in a mini-batch, a substitution
+            parameter for :attr:`batch_sampler`, if :attr:`batch_sampler`
+            is not set, a default `paddle.io.BatchSampler` will be used
+            and initialized by :attr:`batch_size`, :attr:`shuffle` and
+            :attr:`drop_last`. Default 1.
+        shuffle(bool, optional): whether to shuffle indices order before generating
+            batch indices, a substitution parameter for :attr:`batch_sampler`,
+            see :attr:`batch_size`. Default False.
+        drop_last(bool, optional): whether to drop the last incomplete batch when dataset
+            size is not divisible by the batch size, a substitution parameter
+            for :attr:`batch_sampler`, see :attr:`batch_size`. Default False.
+        collate_fn(callable, optional): function to generate mini-batch data by merging
+            the sample list, None for only stacking each field of samples in axis
+            0(same as :attr:`np.stack(..., axis=0)`). Default None.
+        num_workers(int, optional): the number of subprocesses used to load data, 0 for no
+            subprocess used and loading data in main process. Default 0.
+        use_buffer_reader (bool, optional): whether to use buffered reader.
+            If use_buffer_reader=True, the DataLoader would prefetch
+            batch data asynchronously, so it would speed up data feeding
+            and occupies a little more CPU or GPU memory, i.e., the memory
+            of one batch input data. Default True.
+        prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch
+            if use_buffer_reader=True. Default 2.
+        use_shared_memory (bool, optional): whether to use shared memory to speed up
+            putting data into inter-process queue, set :attr:`use_shared_memory`
+            as True only when the shared memory space on your machine(e.g.
+            space of '/dev/shm' on Linux operating system) is large enough.
+            Shared memory will only be enabled in multi-process mode(num_workers
+            > 0). Default True.
+        timeout(int, optional): the timeout value for getting data from output queue
+            of subprocesses. Default 0.
+        worker_init_fn(callable, optional): init function which will be called with
+            worker id on each subprocess starting if not set as None. Default
+            None.
+
+    Returns:
+        DataLoader: an iterable object for data iterating, each element of the generated data is a Tensor.
+
+    Examples:
+
+        ..
code-block:: python + + import numpy as np + + import paddle + import paddle.nn as nn + import paddle.nn.functional as F + from paddle.io import Dataset, BatchSampler, DataLoader + + BATCH_NUM = 20 + BATCH_SIZE = 16 + EPOCH_NUM = 4 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + + class SimpleNet(nn.Layer): + def __init__(self): + super().__init__() + self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + def forward(self, image, label=None): + return self.fc(image) + + simple_net = SimpleNet() + opt = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=simple_net.parameters()) + + loader = DataLoader(dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + for e in range(EPOCH_NUM): + for i, (image, label) in enumerate(loader()): + out = simple_net(image) + loss = F.cross_entropy(out, label) + avg_loss = paddle.mean(loss) + avg_loss.backward() + opt.minimize(avg_loss) + simple_net.clear_gradients() + print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) + + + .. note:: + For reading iterable dataset with multiprocess Dataloader, + please see :code:`paddle.io.IterableDataset` + + """ + + def __init__( + self, + dataset, + feed_list=None, + places=None, + return_list=True, + batch_sampler=None, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + prefetch_factor=2, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + persistent_workers=False, + ): + self.return_list = return_list + self.collate_fn = collate_fn + self.use_buffer_reader = use_buffer_reader + self.prefetch_factor = prefetch_factor + self.worker_init_fn = worker_init_fn + + self.dataset = dataset + + if not return_list and not _non_static_mode(): + assert ( + feed_list is not None + ), "feed_list should be set when return_list=False" + self.feed_list = feed_list + + if places is None: + places = _current_expected_place() + if isinstance(places, (list, tuple)): + places = _get_paddle_place_list(places) + else: + places = _get_paddle_place(places) + self.places = _convert_places(places) + + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 0 and ( + sys.platform == 'darwin' or sys.platform == 'win32' + ): + warnings.warn( + "DataLoader with multi-process mode is not supported on MacOs and Windows currently." 
+ " Please use signle-process mode with num_workers = 0 instead" + ) + num_workers = 0 + self.num_workers = num_workers + + assert prefetch_factor > 0, "prefetch_factor should be a positive value" + + self.use_shared_memory = use_shared_memory + if use_shared_memory and num_workers == 0: + self.use_shared_memory = False + + assert timeout >= 0, "timeout should be a non-negative value" + self.timeout = timeout + + if isinstance(dataset, IterableDataset): + self.dataset_kind = _DatasetKind.ITER + if shuffle: + raise ValueError( + "IterableDataset not support shuffle, but got shuffle={}".format( + shuffle + ) + ) + if batch_sampler is not None: + raise ValueError( + "IterableDataset expect unspecified batch_sampler" + ) + else: + self.dataset_kind = _DatasetKind.MAP + + if batch_sampler is not None: + assert batch_size == 1 and not shuffle and not drop_last, ( + "batch_size/shuffle/drop_last should not be set when " + "batch_sampler is given" + ) + self.batch_sampler = batch_sampler + self.batch_size = None + elif batch_size is None: + self.batch_sampler = None + self.batch_size = None + else: + assert batch_size > 0, ( + "batch_size should be None or a positive value when " + "batch_sampler is not given" + ) + self.batch_size = batch_size + if isinstance(dataset, IterableDataset): + self.batch_sampler = _InfiniteIterableSampler( + dataset, batch_size + ) + else: + self.batch_sampler = BatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + ) + + self.drop_last = drop_last + self.auto_collate_batch = self.batch_sampler is not None + + self.pin_memory = False + if _non_static_mode(): + self.pin_memory = ( + True if use_pinned_memory() is None else use_pinned_memory() + ) + + self._persistent_workers = persistent_workers + self._iterator = None + self.num_workers = AuToTune(self).__call__() + + def __len__(self): + if self.dataset_kind == _DatasetKind.ITER: + raise ValueError("length of IterableDataset not supported") + else: + if self.auto_collate_batch: + return len(self.batch_sampler) + else: + return len(self.dataset) + + def __iter__(self): + if self.num_workers == 0: + return _DataLoaderIterSingleProcess(self) + elif self._persistent_workers: + if self._iterator is None: + self._iterator = _DataLoaderIterMultiProcess(self) + else: + self._iterator._reset() + return self._iterator + else: + return _DataLoaderIterMultiProcess(self) + + def __call__(self): + return self.__iter__() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f8e00eabecf5e..c7e550f7aa117 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -181,9 +181,7 @@ def __init__( not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu() ): - raise NotImplementedError( - "'lr_ratio' is unimplemented in CPU, and NPU" - ) + raise NotImplementedError("'lr_ratio' is unimplemented in CPU.") if parameters is not None: # paddle.Tensor is also iterable, so here we don't check whether diff --git a/python/paddle/static/quantization/post_training_quantization.py b/python/paddle/static/quantization/post_training_quantization.py index f80b293a12d4b..266c1756a334b 100644 --- a/python/paddle/static/quantization/post_training_quantization.py +++ b/python/paddle/static/quantization/post_training_quantization.py @@ -620,7 +620,7 @@ def _load_model_data(self): self._batch_nums if self._batch_nums else len(self._data_loader) ) return - self._data_loader = io.DataLoader.from_generator( + self._data_loader = 
reader.DataLoader.from_generator( feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True ) if self._sample_generator is not None: diff --git a/python/setup.py.in b/python/setup.py.in index 1c59c4aaa4746..ebf949c2b41f3 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -445,7 +445,6 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.dataloader', 'paddle.fluid.contrib', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.incubate', @@ -492,6 +491,7 @@ packages=['paddle', 'paddle.sparse.nn.functional', 'paddle.incubate.xpu', 'paddle.io', + 'paddle.io.dataloader', 'paddle.optimizer', 'paddle.nn', 'paddle.nn.functional', diff --git a/setup.py b/setup.py index 6a305243bbe3d..297e66eba0499 100644 --- a/setup.py +++ b/setup.py @@ -1421,7 +1421,6 @@ def get_setup_parameters(): 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.dataloader', 'paddle.fluid.contrib', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.incubate', @@ -1468,6 +1467,7 @@ def get_setup_parameters(): 'paddle.sparse.nn.functional', 'paddle.incubate.xpu', 'paddle.io', + 'paddle.io.dataloader', 'paddle.optimizer', 'paddle.nn', 'paddle.nn.functional', diff --git a/test/auto_parallel/auto_parallel_relaunch_model.py b/test/auto_parallel/auto_parallel_relaunch_model.py index 290af66485512..6fa3bc9eaa1ff 100644 --- a/test/auto_parallel/auto_parallel_relaunch_model.py +++ b/test/auto_parallel/auto_parallel_relaunch_model.py @@ -109,7 +109,7 @@ def mlp_pretrain_forward(train_program, start_program): error_cost = paddle.nn.functional.square_error_cost(predict, label) loss = paddle.mean(error_cost) - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=True ) diff --git a/test/auto_parallel/engine_api.py b/test/auto_parallel/engine_api.py index cb0b4f2541e94..a2725a57b8e53 100644 --- a/test/auto_parallel/engine_api.py +++ b/test/auto_parallel/engine_api.py @@ -297,7 +297,7 @@ def train_builtin_data_vars(): with static.program_guard(engine.main_program, engine.startup_program): feed_list = engine.inputs + engine.labels print(feed_list) - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=feed_list, capacity=4 * batch_size, iterable=False ) @@ -324,7 +324,7 @@ def train_non_builtin_data_vars(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() @@ -383,7 +383,7 @@ def get_cost(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() @@ -434,7 +434,7 @@ def get_cost_by_default_program(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() diff --git a/test/auto_parallel/test_dist_attr_v2.py b/test/auto_parallel/test_dist_attr_v2.py index 11c140a812a9f..1d15c34221f90 100644 --- 
a/test/auto_parallel/test_dist_attr_v2.py +++ b/test/auto_parallel/test_dist_attr_v2.py @@ -130,7 +130,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_dist_context.py b/test/auto_parallel/test_dist_context.py index 10f78aedd4fb9..2944b2db2a3fb 100644 --- a/test/auto_parallel/test_dist_context.py +++ b/test/auto_parallel/test_dist_context.py @@ -112,7 +112,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_serialization.py b/test/auto_parallel/test_serialization.py index 00a30e8a61d4e..d89c9596f4cdb 100644 --- a/test/auto_parallel/test_serialization.py +++ b/test/auto_parallel/test_serialization.py @@ -124,7 +124,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_while_op_completion.py b/test/auto_parallel/test_while_op_completion.py index 6d5264ab971b7..3f9b5b151ab08 100644 --- a/test/auto_parallel/test_while_op_completion.py +++ b/test/auto_parallel/test_while_op_completion.py @@ -148,7 +148,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_while_op_partition.py b/test/auto_parallel/test_while_op_partition.py index 6dc02d6834f5b..00f3a70bbcf42 100644 --- a/test/auto_parallel/test_while_op_partition.py +++ b/test/auto_parallel/test_while_op_partition.py @@ -136,7 +136,7 @@ def get_program(): data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 34f45371cca2a..7bc33dfda33e8 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -2,9 +2,11 @@ add_subdirectory(benchmark) if(WITH_CINN) add_subdirectory(cinn) endif() -# add_subdirectory(controlflow) -# add_subdirectory(detection) -# add_subdirectory(dlnne) +add_subdirectory(controlflow) +add_subdirectory(detection) +if(WITH_DLNNE) + add_subdirectory(dlnne) +endif() add_subdirectory(elementwise) add_subdirectory(fused) if(WITH_LITE) @@ -21,7 +23,10 @@ endif() add_subdirectory(prim_ops) add_subdirectory(reader) add_subdirectory(reduce_ops) -# add_subdirectory(tensorrt) +# TODO(gouzil): enable this after the bug is fixed. 
windows: Exit code 0xc000007b, pr: #53470 +# if(WITH_GPU AND TENSORRT_FOUND) +# add_subdirectory(tensorrt) +# endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} executor) diff --git a/test/cpp/fluid/controlflow/CMakeLists.txt b/test/cpp/fluid/controlflow/CMakeLists.txt new file mode 100644 index 0000000000000..87950fdcd46f2 --- /dev/null +++ b/test/cpp/fluid/controlflow/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + conditional_block_op_test + SRCS conditional_block_op_test.cc + DEPS conditional_block_op standalone_executor executor) diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc b/test/cpp/fluid/controlflow/conditional_block_op_test.cc similarity index 100% rename from paddle/fluid/operators/controlflow/conditional_block_op_test.cc rename to test/cpp/fluid/controlflow/conditional_block_op_test.cc diff --git a/test/cpp/fluid/detection/CMakeLists.txt b/test/cpp/fluid/detection/CMakeLists.txt new file mode 100644 index 0000000000000..bc9e8e0a53603 --- /dev/null +++ b/test/cpp/fluid/detection/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + mask_util_test + SRCS mask_util_test.cc + DEPS memory mask_util) diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/test/cpp/fluid/detection/mask_util_test.cc similarity index 100% rename from paddle/fluid/operators/detection/mask_util_test.cc rename to test/cpp/fluid/detection/mask_util_test.cc diff --git a/test/cpp/fluid/dlnne/CMakeLists.txt b/test/cpp/fluid/dlnne/CMakeLists.txt new file mode 100644 index 0000000000000..5089a41351ce3 --- /dev/null +++ b/test/cpp/fluid/dlnne/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + test_dlnne_engine_op + SRCS dlnne_engine_op_test.cc + DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc similarity index 100% rename from paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc rename to test/cpp/fluid/dlnne/dlnne_engine_op_test.cc diff --git a/test/dygraph_to_static/test_resnet_v2.py b/test/dygraph_to_static/test_resnet_v2.py index bf332809ff8f0..2efbe46cedfec 100644 --- a/test/dygraph_to_static/test_resnet_v2.py +++ b/test/dygraph_to_static/test_resnet_v2.py @@ -255,7 +255,7 @@ def do_train(self, to_static): batch_size=batch_size, drop_last=True, ) - data_loader = paddle.io.DataLoader.from_generator( + data_loader = paddle.fluid.io.DataLoader.from_generator( capacity=5, iterable=True ) data_loader.set_sample_list_generator(train_reader) diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py index 3e8cb4c10b3d4..a86259cc6d736 100644 --- a/test/dygraph_to_static/test_simnet_v2.py +++ b/test/dygraph_to_static/test_simnet_v2.py @@ -132,7 +132,7 @@ def train(conf_dict, to_static): global_step = 0 losses = [] - train_loader = paddle.io.DataLoader.from_generator( + train_loader = paddle.fluid.io.DataLoader.from_generator( capacity=16, return_list=True, iterable=True, use_double_buffer=True ) get_train_examples = simnet_process.get_reader("train", epoch=args.epoch) diff --git a/test/ir/inference/test_trt_convert_elementwise.py b/test/ir/inference/test_trt_convert_elementwise.py index a6faff0787be5..0ac4a2ba46209 100644 --- a/test/ir/inference/test_trt_convert_elementwise.py +++ b/test/ir/inference/test_trt_convert_elementwise.py @@ -1214,5 +1214,161 @@ def test(self): self.run_test() +class TrtConvertElementwise0D(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def 
generate_input(dims, op_type): + shape = [] + if dims == 0: + shape = [] + elif dims == 1: + shape = [8] + elif dims == 2: + shape = [1, 8] + elif dims == 3: + shape = [1, 8, 8] + else: + shape = [1, 8, 8, 8] + + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + elif op_type == "elementwise_mod": + return np.random.uniform(low=0.1, high=1.0, size=shape).astype( + np.float32 + ) + else: + return np.random.random(shape).astype(np.float32) + + for dims in [[0, 0], [0, 1], [0, 2], [1, 0], [2, 0]]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + "elementwise_floordiv", + "elementwise_mod", + ]: + for axis in [-1 if dims[0] == 1 or dims[0] == 0 else 1]: + self.dims = dims[0] + dics = [{"axis": axis}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": { + "output_data": np.float32 + if op_type != "elementwise_floordiv" + else np.int32 + }, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": TensorConfig( + data_gen=partial( + generate_input, dims[1], op_type + ) + ) + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial( + generate_input, dims[0], op_type + ) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. 
+ if self.dims == 0: + self.dynamic_shape.min_input_shape = {"input_data": []} + self.dynamic_shape.max_input_shape = {"input_data": []} + self.dynamic_shape.opt_input_shape = {"input_data": []} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [16]} + self.dynamic_shape.opt_input_shape = {"input_data": [8]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 8]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 8]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 4]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 16]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 8, 8]} + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 8, 8, 8] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 8, 8, 8] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 8, 8, 8] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape and (self.dims == 1 or self.dims == 0): + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/test/ir/inference/test_trt_convert_equal.py b/test/ir/inference/test_trt_convert_equal.py index 4993e830f190b..5879a003d9546 100644 --- a/test/ir/inference/test_trt_convert_equal.py +++ b/test/ir/inference/test_trt_convert_equal.py @@ -40,54 +40,64 @@ def generate_input(shape): return np.random.random(shape).astype(np.float32) for op_type in ["equal", "not_equal"]: - for batch in [1, 2, 4]: - for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: - for axis in [-1 if len(shape) == 1 else 1]: - self.dims = len(shape) - dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] - ops_config = [ - { - "op_type": op_type, - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"], - }, - "op_outputs": {"Out": ["compare_output_data"]}, - "op_attrs": dics[0], - "outputs_dtype": { - "compare_output_data": np.bool_ - }, + for shape in [[], [1, 1], [1, 1, 32], [1, 1, 16, 32]]: + for axis in [-1 if len(shape) == 1 or len(shape) == 0 else 1]: + self.dims = len(shape) + dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], }, - { - "op_type": "cast", - "op_inputs": {"X": ["compare_output_data"]}, - "op_outputs": {"Out":
["output_data"]}, - "op_attrs": dics[1], - "outputs_dtype": {"output_data": np.float32}, - }, - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, shape) - ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, shape) - ), - }, - outputs=["output_data"], - ) - yield program_config + "op_outputs": {"Out": ["compare_output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": {"compare_output_data": np.bool_}, + }, + { + "op_type": "cast", + "op_inputs": {"X": ["compare_output_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[1], + "outputs_dtype": {"output_data": np.float32}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), + }, + outputs=["output_data"], + ) + yield program_config def sample_predictor_configs( self, program_config ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. + if self.dims == 0: + self.dynamic_shape.min_input_shape = { + "input_data1": [], + "input_data2": [], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [], + "input_data2": [], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [], + "input_data2": [], + } if self.dims == 2: self.dynamic_shape.min_input_shape = { "input_data1": [1, 1], diff --git a/test/ir/inference/test_trt_convert_expand_as_v2.py b/test/ir/inference/test_trt_convert_expand_as_v2.py index 46b3a2232e471..be5458cac07dc 100644 --- a/test/ir/inference/test_trt_convert_expand_as_v2.py +++ b/test/ir/inference/test_trt_convert_expand_as_v2.py @@ -49,8 +49,11 @@ def generate_input1(attrs: List[Dict[str, Any]]): elif self.dims == 1: self.input_shape = [32] return np.random.random([32]).astype(np.float32) + elif self.dims == 0: + self.input_shape = [] + return np.random.random([]).astype(np.float32) - for dims in [1, 2, 3, 4]: + for dims in [0, 1, 2, 3, 4]: for shape in [ [10, 8, 32, 32], [2, 8, 32, 32], @@ -125,6 +128,10 @@ def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"expand_v2_input": [32]} self.dynamic_shape.max_input_shape = {"expand_v2_input": [64]} self.dynamic_shape.opt_input_shape = {"expand_v2_input": [32]} + elif self.dims == 0: + self.dynamic_shape.min_input_shape = {"expand_v2_input": []} + self.dynamic_shape.max_input_shape = {"expand_v2_input": []} + self.dynamic_shape.opt_input_shape = {"expand_v2_input": []} def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} @@ -132,7 +139,9 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape: + ver = paddle_infer.get_trt_compile_version() + ver_num = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 + if dynamic_shape and (ver_num > 8000 or self.dims > 0): return 1, 2 else: return 0, 3 diff --git a/test/ir/inference/test_trt_convert_reshape.py b/test/ir/inference/test_trt_convert_reshape.py index 3f88b39003bb9..c30d973651bad 100644 --- a/test/ir/inference/test_trt_convert_reshape.py +++ b/test/ir/inference/test_trt_convert_reshape.py @@ -431,5 +431,99 @@ def test(self): self.run_test() +class TrtConvertReshapeZeroDimsTest(TrtLayerAutoScanTest): + def is_program_valid(self, 
program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims > 0: + self.input_shape = [1] * self.dims + return np.random.random(self.input_shape).astype(np.float32) + elif self.dims == 0: + self.input_shape = [] + return np.random.random([]).astype(np.float32) + + for dims in [0, 1, 2, 3]: + for shape in [ + [], + [1, 1], + ]: + dics = [ + { + "shape": shape, + }, + ] + self.dims = dims + dics_input = [{"X": ["reshape_input"]}] + + ops_config = [ + { + "op_type": "reshape", + "op_inputs": dics_input[0], + "op_outputs": {"Out": ["reshape_out"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["reshape_out"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "reshape_input": self.input_shape + } + self.dynamic_shape.max_input_shape = { + "reshape_input": self.input_shape + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": self.input_shape + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + # only test dynamic shape mode + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-3 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_model.py b/test/legacy_test/test_model.py index af3cb7bdefb74..c5d62761f7f68 100644 --- a/test/legacy_test/test_model.py +++ b/test/legacy_test/test_model.py @@ -199,13 +199,13 @@ def setUpClass(cls): mode='test', return_label=False, sample_num=sp_num ) - cls.train_loader = fluid.io.DataLoader( + cls.train_loader = paddle.io.DataLoader( cls.train_dataset, places=cls.device, batch_size=64 ) - cls.val_loader = fluid.io.DataLoader( + cls.val_loader = paddle.io.DataLoader( cls.val_dataset, places=cls.device, batch_size=64 ) - cls.test_loader = fluid.io.DataLoader( + cls.test_loader = paddle.io.DataLoader( cls.test_dataset, places=cls.device, batch_size=64 ) @@ -322,14 +322,14 @@ def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None): rank=rank, ) - train_loader = fluid.io.DataLoader( + train_loader = paddle.io.DataLoader( self.train_dataset, batch_sampler=train_sampler, places=self.device, return_list=True, ) - val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=val_sampler, places=self.device, @@ -375,14 +375,14 @@ def fit_with_tuple_input(self, dynamic, num_replicas=None, rank=None): rank=rank, ) - train_loader = fluid.io.DataLoader( + train_loader = paddle.io.DataLoader( self.train_dataset, batch_sampler=train_sampler, places=self.device, return_list=True, )
- val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=val_sampler, places=self.device, @@ -404,7 +404,7 @@ def evaluate(self, dynamic): self.val_dataset, batch_size=64, shuffle=False ) - val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=sampler, places=self.device, @@ -432,7 +432,7 @@ def predict(self, dynamic): self.test_dataset, batch_size=64, shuffle=False ) - test_loader = fluid.io.DataLoader( + test_loader = paddle.io.DataLoader( self.test_dataset, batch_sampler=sampler, places=self.device, diff --git a/test/prim/prim/vjp/test_comp_high_grad.py b/test/prim/prim/vjp/test_comp_high_grad.py index 76283528e2404..87b3c8f300ecd 100644 --- a/test/prim/prim/vjp/test_comp_high_grad.py +++ b/test/prim/prim/vjp/test_comp_high_grad.py @@ -226,6 +226,7 @@ def test_high_grad(self): self.func_triple(p) +''' @param.parameterized_class( ('shape1', 'shape2'), [ @@ -328,7 +329,6 @@ def test_high_grad(self): for p in places: self.func_double(p) self.func_triple(p) - - +''' if __name__ == '__main__': unittest.main() diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py new file mode 100644 index 0000000000000..e4b290a2e66e1 --- /dev/null +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -0,0 +1,298 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test_xpu import XPUOpTest + +import paddle + +paddle.enable_static() + + +def depthwiseconv2dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs['padding_algorithm'] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + "Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME', 'VALID' or 'EXPLICIT'."
% str(padding_algorithm) + ) + + if attrs['data_format'] == 'NHWC': + input_ = np.transpose(input_, [0, 3, 1, 2]) + in_n, in_c, in_h, in_w = input_.shape + f_c, f_out_c, f_h, f_w = filter_.shape + groups = attrs['groups'] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = ( + attrs['strides'], + attrs['paddings'], + attrs['dilations'], + ) + + # update pad and dilation + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, kernel_size, kernel_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:4] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1] + input_data_shape = input_.shape[2:4] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_h_0, pad_h_1 = pad[0], pad[0] + pad_w_0, pad_w_1 = pad[1], pad[1] + if len(pad) == 4: + pad_h_0, pad_h_1 = pad[0], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[3] + + d_block_h = dilations[0] * (f_h - 1) + 1 + d_block_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_block_h + out_w = (in_w - 1) * stride[1] + d_block_w + if 'output_size' in attrs: + output_size = attrs['output_size'] + out_h = output_size[0] + pad_h_0 + pad_h_1 + out_w = output_size[1] + pad_w_0 + pad_w_1 + out_pad_h = 0 + out_pad_w = 0 + if 'output_padding' in attrs: + out_pad_h = attrs['output_padding'][0] + out_pad_w = attrs['output_padding'][1] + out = np.zeros( + (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype + ) + + for n in range(in_n): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[ + n, g * sub_in_c : (g + 1) * sub_in_c, i, j + ] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1)) + input_masked = np.tile(input_masked, (1, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked + * filter_[ + g * sub_in_c : (g + 1) * sub_in_c, k, :, : + ], + axis=0, + ) + i1, i2 = i * stride[0], i * stride[0] + d_block_h + j1, j2 = j * stride[1], j * stride[1] + d_block_w + out[ + n, + g * f_out_c + k, + i1 : i2 : dilations[0], + j1 : j2 : dilations[1], + ] += tmp_out + + out = out[ + :, + :, + pad_h_0 : out_h - pad_h_1 + out_pad_h, + pad_w_0 : out_w - pad_w_1 + out_pad_w, + ] + if attrs['data_format'] == 'NHWC': + out = np.transpose(out, [0, 2, 3, 1]) + return out + + +class XPUTestDepthwiseConv2DTransposeOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'depthwise_conv2d_transpose' + self.use_dynamic_create_class = False + + class TestDepthwiseConv2DTransposeOp(XPUOpTest): + def setUp(self): + # init as conv transpose + self.need_check_grad = True + self.is_test = False + self.use_cudnn = False + self.use_mkldnn = False + self.output_size = None + self.output_padding = [] + self.data_format = "NCHW" + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_test_case() + self.__class__.op_type = "depthwise_conv2d_transpose" + + input_ = np.random.random(self.input_size).astype(self.dtype) + filter_ = np.random.random(self.filter_size).astype(self.dtype) + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides':
self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size + + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + + output = depthwiseconv2dtranspose_forward_naive( + input_, filter_, self.attrs + ).astype(self.dtype) + + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_no_input(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, ['Filter'], 'Output', no_grad_set={'Input'} + ) + + def test_check_grad_no_filter(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, 'Output' + ) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "depthwise_conv2d_transpose" + + class TestWithSymmetricPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithAsymmetricPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithSAMEPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [2, 3, 6, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.padding_algorithm = 'SAME' + + class TestWithVALIDPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + self.padding_algorithm = 'VALID' + + class TestWithGroups(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + class TestWithStride(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithEvenUpsample(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + +support_types = 
get_xpu_op_support_types('depthwise_conv2d_transpose') +for stype in support_types: + create_test_class(globals(), XPUTestDepthwiseConv2DTransposeOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/test/xpu/test_pad_op_xpu.py b/test/xpu/test_pad_op_xpu.py new file mode 100644 index 0000000000000..4f4d68ab73d0e --- /dev/null +++ b/test/xpu/test_pad_op_xpu.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test_xpu import XPUOpTest +from test_attribute_var import UnittestBase + +import paddle +from paddle.fluid import Program, program_guard + + +def pad_wrapper(x, paddings, pad_value): + return paddle.nn.functional.pad( + x, pad=list(paddings), mode='constant', value=pad_value + ) + + +paddle.enable_static() + + +class XPUTestPadOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "pad" + self.use_dynamic_create_class = False + + class TestPadOp(XPUOpTest): + def setUp(self): + self.op_type = "pad" + self.place = paddle.XPUPlace(0) + self.python_api = pad_wrapper + self.public_python_api = pad_wrapper + self.init_dtype() + self.init_test_case() + self.init_data() + + def init_dtype(self): + self.dtype = self.in_type + + def init_test_case(self): + self.shape = (16, 16) + self.paddings = [(0, 1), (2, 3)] + self.pad_value = 0.0 + + def init_data(self): + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.outputs = { + 'Out': np.pad( + self.inputs['X'], + self.paddings, + mode='constant', + constant_values=self.pad_value, + ) + } + self.attrs = { + 'paddings': list(np.array(self.paddings).flatten()), + 'pad_value': self.pad_value, + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestCase1(TestPadOp): + def init_test_case(self): + self.shape = (2, 3, 4, 5) + self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)] + self.pad_value = 0.5 + + class TestCase2(TestPadOp): + def init_test_case(self): + self.shape = (5, 5, 5) + self.paddings = [(0, 0), (0, 0), (1, 2)] + self.pad_value = 1.0 + + class TestCase3(TestPadOp): + def init_test_case(self): + self.shape = 100 + self.paddings = [(0, 1)] + self.pad_value = 0.9 + + class TestPadOpError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.framework._static_guard(): + with program_guard(Program(), Program()): + input_data = np.random.random((2, 2)).astype("float32") + + def test_Variable(): + paddle.nn.functional.pad(x=input_data, pad=[1, 1, 1, 1]) + + self.assertRaises(TypeError, test_Variable) + + data = paddle.static.data( + name='data', shape=[4], dtype='float16' + ) + paddle.nn.functional.pad(x=data, pad=[0, 1]) + + class TestPaddingValueTensor(UnittestBase): + def init_info(self): + self.shapes = [[2, 4]] + 
self.save_path = os.path.join( + self.temp_dir.name, self.path_prefix() + ) + + def test_static(self): + with paddle.fluid.framework._static_guard(): + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 4]) + x.stop_gradient = False + feat = fc(x) # [2, 10] + + out = self.call_func(feat) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue(self.var_prefix() in str(main_prog)) + exe = paddle.static.Executor(paddle.XPUPlace(0)) + exe.run(startup_prog) + res = exe.run(fetch_list=[feat, out]) + gt = np.pad( + res[0], [1, 1], 'constant', constant_values=[1.0, 1.0] + ) + np.testing.assert_allclose(res[1], gt) + paddle.static.save_inference_model( + self.save_path, [x], [feat, out], exe + ) + # Test for Inference Predictor + infer_outs = self.infer_prog() + gt = np.pad( + infer_outs[0], + [1, 1], + 'constant', + constant_values=[1.0, 1.0], + ) + np.testing.assert_allclose(infer_outs[1], gt) + + def path_prefix(self): + return 'padding_value' + + def var_prefix(self): + return "Var[" + + def call_func(self, x): + padding_value = paddle.assign([1.0]) + out = paddle.nn.functional.pad( + x, pad=[1, 1, 1, 1], value=padding_value, mode='constant' + ) + return out + + class TestPaddingValueTensor2(TestPaddingValueTensor): + def call_func(self, x): + padding_value = paddle.assign([1.0]) + # test for int value + tmp = paddle.nn.functional.pad(x, pad=[1, 1, 1, 1], value=1) + out = paddle.nn.functional.pad( + x, pad=[1, 1, 1, 1], value=padding_value + ) + return out + + class TestPaddingValueTensor3(unittest.TestCase): + def test_static(self): + with paddle.fluid.framework._static_guard(): + np_x = np.random.random((16, 16)).astype('float32') + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + x = paddle.assign(np_x).astype('float32') + pad_value = paddle.assign([0.0]).astype('float64') + y = paddle.nn.functional.pad( + x, [0, 1, 2, 3], value=pad_value + ) + loss = y.sum() + optimize_ops, params_grads = paddle.optimizer.SGD( + 0.01 + ).minimize(loss) + + exe = paddle.static.Executor(paddle.XPUPlace(0)) + res = exe.run( + main_prog, fetch_list=[y] + [g for p, g in params_grads] + ) + pd_out = res[0] + np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) + np.testing.assert_allclose(pd_out, np_out) + + +support_types = get_xpu_op_support_types("pad") +for stype in support_types: + create_test_class(globals(), XPUTestPadOp, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 8bbe39b3b7659..4da57036b68c8 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -340,10 +340,8 @@ def get_pr_ut(self): file_list.append(filename) else: filterFiles.append(filename) - elif ( - ('/xpu/' in filename.lower()) - or ('/npu/' in filename.lower()) - or ('/ipu/' in filename.lower()) + elif ('/xpu/' in filename.lower()) or ( + '/ipu/' in filename.lower() ): filterFiles.append(filename) else: diff --git a/tools/xpu/check_xpu_dependence.sh b/tools/xpu/check_xpu_dependence.sh new file mode 100644 index 0000000000000..abfea14330819 --- /dev/null +++ b/tools/xpu/check_xpu_dependence.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -u + +if [[ $# -ne 2 ]]; then + echo "usage: ./check_xpu_dependence.sh XPU_BASE_URL XPU_XCCL_BASE_URL" + exit 1 +fi + +xpu_base_url=$1 +xccl_base_url=$2 + +echo "xpu_base_url: $xpu_base_url" +echo "xccl_base_url: $xccl_base_url" + +function check_files() { + local url="$1" + local local_dir="$2" + echo "local dir: $local_dir" + local local_file_name="${local_dir}.tar.gz" + echo "local file name: $local_file_name" + + shift + shift + local files=("$@") + + # start to download + echo "downloading: $url" + rm -f ./$local_file_name + wget -q $url -O ${local_file_name} + if [[ $? -ne 0 ]]; then + echo "downloading failed: $url" + return 1 + else + echo "downloading ok: $url" + fi + + # remove local dir and de-compress + rm -rf ./$local_dir + tar xf $local_file_name + if [[ $? -ne 0 ]]; then + echo "de-compress failed: $local_file_name" + return 1 + fi + + for i in "${files[@]}"; + do + echo "checking $local_dir/$i" + if [[ ! -f $local_dir/$i ]]; then + echo "checking failed: $local_dir/$i" + return 1 + else + echo "checking ok: $local_dir/$i" + fi + done + + # clean + rm -f ./$local_file_name + rm -rf ./$local_dir +} + +# XRE +xre_tar_file_names=("xre-kylin_aarch64" "xre-bdcentos_x86_64" "xre-ubuntu_x86_64" "xre-centos7_x86_64") +xre_inner_file_names=("include/xpu/runtime.h" "so/libxpurt.so") +for name in ${xre_tar_file_names[@]}; do + url="${xpu_base_url}/${name}.tar.gz" + check_files $url $name "${xre_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XRE check failed, name: $name" + exit 1 + else + echo "XRE check ok, name: $name" + fi +done + +# XDNN +xdnn_tar_file_names=("xdnn-kylin_aarch64" "xdnn-bdcentos_x86_64" "xdnn-ubuntu_x86_64" "xdnn-centos7_x86_64") +xdnn_inner_file_names=("include/xpu/xdnn.h" "so/libxpuapi.so") +for name in ${xdnn_tar_file_names[@]}; do + url="${xpu_base_url}/${name}.tar.gz" + check_files $url $name "${xdnn_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XDNN check failed, name: $name" + exit 1 + else + echo "XDNN check ok, name: $name" + fi +done + +# XCCL +xccl_tar_file_names=("xccl_rdma-bdcentos_x86_64" "xccl_rdma-ubuntu_x86_64" "xccl_socket-bdcentos_x86_64" "xccl_socket-deepin_sw6_64" "xccl_socket-kylin_aarch64" "xccl_socket-ubuntu_x86_64") +xccl_inner_file_names=("include/bkcl.h" "so/libbkcl.so") +for name in ${xccl_tar_file_names[@]}; do + url="${xccl_base_url}/${name}.tar.gz" + check_files $url $name "${xccl_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XCCL check failed, name: $name" + exit 1 + else + echo "XCCL check ok, name: $name" + fi +done + +echo "ALL CHECKS PASSED" + +exit 0
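
Usage note, not part of the patch: the DataLoader docstring above points readers to :code:`paddle.io.IterableDataset` for the multi-process iterable case, and the new __init__ checks reject shuffle and batch_sampler for such datasets. A minimal sketch of that path, assuming only the public paddle.io APIs; the RandomIterableDataset name and the worker sharding via get_worker_info are illustrative, not taken from this patch:

import numpy as np
import paddle
from paddle.io import IterableDataset, DataLoader, get_worker_info

class RandomIterableDataset(IterableDataset):
    # Streams (image, label) pairs; shuffle/batch_sampler must stay unset,
    # so the DataLoader falls back to _InfiniteIterableSampler for batching.
    def __init__(self, num_samples):
        super().__init__()
        self.num_samples = num_samples

    def __iter__(self):
        info = get_worker_info()  # None when num_workers == 0
        wid = 0 if info is None else info.id
        nworkers = 1 if info is None else info.num_workers
        # Each worker yields a disjoint shard to avoid duplicated samples.
        for _ in range(wid, self.num_samples, nworkers):
            image = np.random.random([784]).astype('float32')
            label = np.random.randint(0, 10, (1,)).astype('int64')
            yield image, label

loader = DataLoader(RandomIterableDataset(64), batch_size=16, num_workers=2)
for image, label in loader():
    print(image.shape, label.shape)  # [16, 784], [16, 1]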