From 2bb5aae8fd6a521183565ade69dfc72ec8a83ce0 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 21 Feb 2022 14:29:35 +0800 Subject: [PATCH 001/101] [pten]rm reduce_sum and reduce_mean raw kernel (#39484) * rm reduce_sum raw kernel * remove reduce_mean kernel * remove reduce_mean kernel * reduce support int and int64_t * mean support int and int64_t type --- .../performance_tests/benchmark_eager_cpu.cc | 2 +- .../performance_tests/benchmark_eager_cuda.cc | 2 +- .../performance_tests/benchmark_fluid_cpu.cc | 2 +- .../performance_tests/benchmark_fluid_cuda.cc | 2 +- .../new_executor/standalone_executor_test.cc | 4 +-- paddle/fluid/imperative/tests/test_tracer.cc | 2 +- .../operators/reduce_ops/reduce_mean_op.cc | 7 ----- .../operators/reduce_ops/reduce_mean_op.cu | 27 ----------------- .../operators/reduce_ops/reduce_sum_op.cc | 21 -------------- .../operators/reduce_ops/reduce_sum_op.cu | 29 ------------------- .../reduce_ops/unity_build_rule.cmake | 1 - paddle/phi/kernels/gpu/math_kernel.cu | 4 ++- paddle/phi/kernels/math_kernel.cc | 2 ++ 13 files changed, 12 insertions(+), 93 deletions(-) delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op.cu diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index b72b7cb87530e..6c4bf9a4f17e6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -176,4 +176,4 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 701a9b2cba195..14e7ce8cfcfb4 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index bea80809a3b17..3292de9363696 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -214,4 +214,4 @@ TEST(Benchmark, FluidMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 86c1ad7e23a6a..e9b7d10070dbf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -247,7 +247,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 
c1d449d30205e..2c3359ffa8e46 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -35,8 +35,8 @@ USE_OP(sigmoid); USE_OP(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); -USE_OP(reduce_mean); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_mean); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index afe1f92ca03b3..d05036f7a12eb 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -590,6 +590,6 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index c8d568c8c2cf7..e80df5f95bb4a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -99,13 +99,6 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(reduce_mean, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); template using CPUReduceMeanGradKernel = diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu deleted file mode 100644 index 30a699e979efc..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_mean, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index cfafc11739948..bdab14a18a05a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -107,27 +107,6 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - reduce_sum, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, ops::SumFunctor>, - ops::ReduceKernel, - - ops::SumFunctor>); - template using CPUReduceSumGradKernel = ops::ReduceSumGradKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, kps::AddFunctor, - kps::IdentityFunctor>, - ops::ReduceCudaKernel, kps::AddFunctor, - kps::IdentityFunctor>); diff --git a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake index 74781ef6f0237..c4f32a8d25764 100644 --- a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake @@ -21,5 +21,4 @@ register_unity_group(cu register_unity_group(cu frobenius_norm_op.cu) register_unity_group(cu logsumexp_op.cu) register_unity_group(cu reduce_max_op.cu) -register_unity_group(cu reduce_mean_op.cu) register_unity_group(cu reduce_min_op.cu) diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 2ae40bd4b1923..c3605ce655f2b 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -165,4 +165,6 @@ PT_REGISTER_KERNEL(mean_raw, float, double, bool, - float16) {} + float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 7fb6cc0ba9cca..e1e3679ea8be8 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -152,6 +152,8 @@ PT_REGISTER_KERNEL(mean, float, double, bool, + int, + int64_t, phi::dtype::float16) {} PT_REGISTER_KERNEL(sum, GPU, From 05982c101e05d59546013f73f7a1b3d80c46f362 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Mon, 21 Feb 2022 15:15:10 +0800 Subject: [PATCH 002/101] gpu ps graph engine (#39699) * gpu ps graph engine * remove logs --- .../framework/fleet/heter_ps/CMakeLists.txt | 2 + .../fleet/heter_ps/graph_gpu_ps_table.h | 144 ++++++ .../fleet/heter_ps/graph_gpu_ps_table_inl.h | 447 ++++++++++++++++++ .../framework/fleet/heter_ps/heter_comm.h | 12 +- .../framework/fleet/heter_ps/test_graph.cu | 112 +++++ 5 files changed, 712 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/test_graph.cu diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 189724a545520..17346f5fd9393 100644 --- 
a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,6 +10,8 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h new file mode 100644 index 0000000000000..a6508bf96c00f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "heter_comm.h" +#include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... 
+by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +class GpuPsGraphTable : public HeterComm { + public: + GpuPsGraphTable(std::shared_ptr resource) + : HeterComm(1, resource) { + load_factor_ = 0.25; + } + void build_graph_from_cpu(std::vector &cpu_node_list); + NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); + NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key, + int sample_size, int len); + NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + void clear_graph_info(); + void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + int sample_size, int *h_left, + int *h_right, + int64_t *src_sample_res, + int *actual_sample_size); + + private: + std::vector gpu_graph_list; +}; +} +}; +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h new file mode 100644 index 0000000000000..839c7e5468c6c --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +/* +comment 0 +this kernel just serves as an example of how to sample nodes' neighbors. +feel free to modify it +index[0,len) saves the nodes' index +actual_size[0,len) is to save the sample size of each node. 
for the ith node in index, actual_size[i] = min(node i's neighbor size, sample size)
+sample_result is to save the neighbor sampling result, its size is len *
+sample_size;
+
+*/
+
+__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
+                                        int* actual_size,
+                                        int64_t* sample_result, int sample_size,
+                                        int len) {
+  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) {
+    auto node_index = index[i];
+    actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size
+                         ? graph.node_list[node_index].neighbor_size
+                         : sample_size;
+    int offset = graph.node_list[node_index].neighbor_offset;
+    for (int j = 0; j < actual_size[i]; j++) {
+      sample_result[sample_size * i + j] = graph.neighbor_list[offset + j];
+    }
+  }
+}
+
+/*
+  comment 1
+
+  gpu i triggers a neighbor_sample task,
+  when this task is done,
+  this function is called to move the sample results on the other gpus back
+  to gpu i and aggregate them.
+  the sample_result is saved on src_sample_res and the actual sample size for
+  each node is saved on actual_sample_size.
+  the number of actual sample results for
+  key[x] (refer to comment 2 for the definition of key)
+  is saved on actual_sample_size[x]; since the neighbor size of key[x] might be
+  smaller than sample_size, the results themselves
+  are saved on src_sample_res [x*sample_size, x*sample_size +
+  actual_sample_size[x])
+
+  since before each gpu runs the neighbor_sample task, the key array is shuffled,
+  but we have the idx array to save the original order.
+  when gpu i gets all the sample results from the other gpus, it relies on the
+  idx array to recover the original order.
+  that's what fill_dvalues does.
+
+*/
+void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
+    int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right,
+    int64_t* src_sample_res, int* actual_sample_size) {
+  for (int i = 0; i < gpu_num; i++) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      continue;
+    }
+    auto shard_len = h_right[i] - h_left[i] + 1;
+    // int cur_step = path_[gpu_id][i].nodes_.size() - 1;
+    // auto& node = path_[gpu_id][i].nodes_[cur_step];
+    auto& node = path_[gpu_id][i].nodes_.front();
+    cudaMemcpyAsync(
+        reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
+        node.val_storage + sizeof(int64_t) * shard_len,
+        node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault,
+        node.out_stream);
+    cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
+                    node.val_storage + sizeof(int) * shard_len,
+                    sizeof(int) * shard_len, cudaMemcpyDefault,
+                    node.out_stream);
+  }
+  for (int i = 0; i < gpu_num; ++i) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      continue;
+    }
+    auto& node = path_[gpu_id][i].nodes_.front();
+    cudaStreamSynchronize(node.out_stream);
+  }
+}
+
+/*
+TODO:
+how to optimize it to eliminate the for loop
+*/
+__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals,
+                             int* d_shard_actual_sample_size,
+                             int* d_actual_sample_size, int* idx,
+                             int sample_size, int len) {
+  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) {
+    d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i];
+    // d_vals[idx[i]] = d_shard_vals[i];
+    for (int j = 0; j < sample_size; j++) {
+      d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j];
+    }
+  }
+}
+
+__global__ void node_query_example(GpuPsCommGraph graph, int start, int size,
+                                   int64_t* res) {
+  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    res[i] = graph.node_list[start + i].node_id;
+  }
+}
+
+void GpuPsGraphTable::clear_graph_info() {
+  if (tables_.size()) {
+    for (auto table : tables_) delete table;
+  }
+  tables_.clear();
+  for (auto graph : gpu_graph_list) {
+    if (graph.neighbor_list != NULL) {
+      cudaFree(graph.neighbor_list);
+    }
+    if (graph.node_list != NULL) {
+      cudaFree(graph.node_list);
+    }
+  }
+  gpu_graph_list.clear();
+}
+/*
+the parameter std::vector<GpuPsCommGraph> cpu_graph_list is generated by cpu.
+it holds the graphs to be stored on each gpu.
+
+for the ith GpuPsCommGraph, every node's key satisfies key % gpu_number == i
+
+In this function, memory is allocated on each gpu to save the graphs,
+gpu i saves the ith graph from cpu_graph_list
+*/
+
+void GpuPsGraphTable::build_graph_from_cpu(
+    std::vector<GpuPsCommGraph>& cpu_graph_list) {
+  PADDLE_ENFORCE_EQ(
+      cpu_graph_list.size(), resource_->total_gpu(),
+      platform::errors::InvalidArgument("the cpu node list size doesn't match "
+                                        "the number of gpu on your machine."));
+  clear_graph_info();
+  for (int i = 0; i < cpu_graph_list.size(); i++) {
+    platform::CUDADeviceGuard guard(resource_->dev_id(i));
+    gpu_graph_list.push_back(GpuPsCommGraph());
+    auto table =
+        new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_);
+    tables_.push_back(table);
+    if (cpu_graph_list[i].node_size > 0) {
+      std::vector keys;
+      std::vector offset;
+      cudaMalloc((void**)&gpu_graph_list[i].node_list,
+                 cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode));
+      cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list,
+                 cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode),
+                 cudaMemcpyHostToDevice);
+      for (int j = 0; j < cpu_graph_list[i].node_size; j++) {
+        keys.push_back(cpu_graph_list[i].node_list[j].node_id);
+        offset.push_back(j);
+      }
+      build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8);
+      gpu_graph_list[i].node_size = cpu_graph_list[i].node_size;
+    } else {
+      gpu_graph_list[i].node_list = NULL;
+      gpu_graph_list[i].node_size = 0;
+    }
+    if (cpu_graph_list[i].neighbor_size) {
+      cudaMalloc((void**)&gpu_graph_list[i].neighbor_list,
+                 cpu_graph_list[i].neighbor_size * sizeof(int64_t));
+      cudaMemcpy(gpu_graph_list[i].neighbor_list,
+                 cpu_graph_list[i].neighbor_list,
+                 cpu_graph_list[i].neighbor_size * sizeof(int64_t),
+                 cudaMemcpyHostToDevice);
+      gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size;
+    } else {
+      gpu_graph_list[i].neighbor_list = NULL;
+      gpu_graph_list[i].neighbor_size = 0;
+    }
+  }
+  cudaDeviceSynchronize();
+}
+NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
+                                                             int64_t* key,
+                                                             int sample_size,
+                                                             int len) {
+  /*
+   comment 2
+  this function shares some kernels with heter_comm_inl.h
+  arguments definitions:
+  gpu_id: the id of the gpu.
+  len: how many keys are used (the length of the array key)
+  sample_size: how many neighbors should be sampled for each node in key.
+
+  the code below shuffles the key array to make the keys
+    that belong to a gpu-card stay together,
+    the shuffled result is saved on d_shard_keys,
+  if the ith element in d_shard_keys_ptr is
+  from the jth element in the original key array, then idx[i] = j,
+  idx could be used to recover the original array.
+  if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] =
+  b,
+  if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1
+
+  for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2
+  when we run this neighbor_sample function,
+  the key is shuffled to [0,2,4,6,8,1,3,5,7]
+  the first part (0,2,4,6,8) % 2 == 0, thus should be handled by gpu 0,
+  the rest part should be handled by gpu 1, because (1,3,5,7) % 2 == 1,
+  h_left = [0,5], h_right = [4,8]
+
+  */
+  NeighborSampleResult* result = new NeighborSampleResult(sample_size, len);
+  if (len == 0) {
+    return result;
+  }
+  cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t));
+  cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int));
+  int* actual_sample_size = result->actual_sample_size;
+  int64_t* val = result->val;
+  int total_gpu = resource_->total_gpu();
+  int dev_id = resource_->dev_id(gpu_id);
+  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
+  platform::CUDADeviceGuard guard(dev_id);
+  auto stream = resource_->local_stream(gpu_id, 0);
+
+  int grid_size = (len - 1) / block_size_ + 1;
+
+  int h_left[total_gpu];   // NOLINT
+  int h_right[total_gpu];  // NOLINT
+
+  auto d_left = memory::Alloc(place, total_gpu * sizeof(int));
+  auto d_right = memory::Alloc(place, total_gpu * sizeof(int));
+  int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
+  int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
+
+  cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
+  cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
+  //
+  auto d_idx = memory::Alloc(place, len * sizeof(int));
+  int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
+
+  auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
+  int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
+  auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t));
+  int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
+  auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
+  int* d_shard_actual_sample_size_ptr =
+      reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
+
+  split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
+
+  fill_shard_key<<<grid_size, block_size_, 0, stream>>>(d_shard_keys_ptr, key,
+                                                        d_idx_ptr, len);
+
+  cudaStreamSynchronize(stream);
+
+  cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int),
+             cudaMemcpyDeviceToHost);
+
+  for (int i = 0; i < total_gpu; ++i) {
+    int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
+    if (shard_len == 0) {
+      continue;
+    }
+    /*
+    comment 3
+    shard_len denotes the size of keys on i-th gpu here,
+    when we sample on i-th gpu, we allocate shard_len * (1 + sample_size)
+    int64_t units
+    of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved
+    for the respective nodes' indexes
+    and actual sample_size.
+    with nodes' indexes we could get the nodes to sample.
+    since the size of int64_t is 8 bytes, while the size of int is 4 bytes,
+    the range of [0,shard_len) contains shard_len * 2 int units;
+    The values of the first half of this range will be updated by
+    the k-v map on i-th-gpu.
+    The second half of this range is saved for actual sample size of each node.
+ For node x, + its sampling result is saved on the range + [shard_len + sample_size * x,shard_len + sample_size * x + + actual_sample_size_of_x) + of alloc_mem_i, actual_sample_size_of_x equals ((int + *)alloc_mem_i)[shard_len + x] + */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), + shard_len * (1 + sample_size) * sizeof(int64_t)); + } + walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // auto& node = path_[gpu_id][i].nodes_.back(); + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // use the key-value map to update alloc_mem_i[0,shard_len) + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // cudaStreamSynchronize(resource_->remote_stream(i, num)); + // tables_[i]->rwlock_->UNLock(); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.front(); + auto shard_len = h_right[i] - h_left[i] + 1; + auto graph = gpu_graph_list[i]; + int* res_array = reinterpret_cast(node.val_storage); + int* actual_size_array = res_array + shard_len; + int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); + neighbor_sample_example<<remote_stream(i, gpu_id)>>>( + graph, res_array, actual_size_array, sample_array, sample_size, + shard_len); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); + tables_[i]->rwlock_->UNLock(); + } + // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + h_left, h_right, d_shard_vals_ptr, + d_shard_actual_sample_size_ptr); + + fill_dvalues<<>>( + d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, + d_idx_ptr, sample_size, len); + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + return result; +} + +NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id, + int sample_size) {} + +NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, + int query_size) { + NodeQueryResult* result = new NodeQueryResult(); + if (query_size <= 0) return result; + int& actual_size = result->actual_sample_size; + actual_size = 0; + cudaMalloc((void**)&result->val, query_size * sizeof(int64_t)); + int64_t* val = result->val; + int dev_id = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id); + std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; + int size = 0; + /* + if idx[i] = a, gpu_begin_pos[i] = p1, + gpu_local_begin_pos[i] = p2; + sample_size[i] = s; + then on gpu a, the nodes of positions [p1,p1 + s) should be returned + and saved from the p2 position on the sample_result array + + for example: + suppose + gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] + start = 3, query_size = 5 + we know [6,8,1,3,5] should be returned; + idx = [0,1] + gpu_begin_pos = [3,0] + local_begin_pos = [0,3] + sample_size = [2,3] + + */ + for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { + auto graph = gpu_graph_list[i]; + if (graph.node_size == 0) { + continue; + } + if (graph.node_size + size > start) { + int cur_size = min(query_size, graph.node_size + size - start); + query_size -= cur_size; + idx.emplace_back(i); + gpu_begin_pos.emplace_back(start - size); + local_begin_pos.emplace_back(actual_size); + start += cur_size; + actual_size += cur_size; + sample_size.emplace_back(cur_size); + create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t)); + } + size += graph.node_size; + } + for (int i = 0; i < idx.size(); i++) { + int dev_id_i = resource_->dev_id(idx[i]); + platform::CUDADeviceGuard guard(dev_id_i); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + int grid_size = (sample_size[i] - 1) / block_size_ + 1; + node_query_example<<remote_stream(idx[i], gpu_id)>>>( + gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], + (int64_t*)node.val_storage); + } + + for (int i = 0; i < idx.size(); i++) { + cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < idx.size(); i++) { + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + return result; +} +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 7b43e68ff0151..1fca8cdf8bb80 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -173,16 +173,18 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); - private: + protected: using Table = HashTable; - int block_size_{256}; - float load_factor_{0.75}; std::vector tables_; std::shared_ptr resource_; - CustomGradMerger merger_; - int topo_aware_{0}; std::vector> path_; + float load_factor_{0.75}; + int block_size_{256}; + + private: std::vector storage_; + CustomGradMerger merger_; + int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; std::vector nccl_inner_comms_; diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu 
b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu new file mode 100644 index 0000000000000..697e0ba2cdf34 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +TEST(TEST_FLEET, graph_comm) { + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + graph_list[ind % gpu_count].node_size++; + graph_list[ind % gpu_count].neighbor_size += neighbor_size; + while (neighbor_size--) { + neighbors[ind].push_back(node_id++); + } + ind++; + } + std::vector neighbor_offset(gpu_count, 0), node_index(gpu_count, 0); + for (int i = 0; i < graph_list.size(); i++) { + graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size]; + graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size]; + } + for (int i = 0; i < node_count; i++) { + ind = i % gpu_count; + graph_list[ind].node_list[node_index[ind]].node_id = i; + graph_list[ind].node_list[node_index[ind]].neighbor_offset = + neighbor_offset[ind]; + graph_list[ind].node_list[node_index[ind]].neighbor_size = + neighbors[i].size(); + for (auto x : neighbors[i]) { + graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x; + } + node_index[ind]++; + } + g.build_graph_from_cpu(graph_list); + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + int64_t answer[6] = {6, 9, 1, 4, 7, 2}; + int64_t *res = new int64_t[6]; + auto query_res = g.query_node_list(0, 2, 6); + cudaMemcpy(res, query_res->val, 48, cudaMemcpyDeviceToHost); + ASSERT_EQ(query_res->actual_sample_size, 6); + for (int i = 0; i < 6; i++) { + ASSERT_EQ(res[i], answer[i]); + } + delete[] res; + delete query_res; + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto 
neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} From a863b32ea8441b2448e487b417e7ce596f530a44 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 21 Feb 2022 15:24:54 +0800 Subject: [PATCH 003/101] [Dy2St]Fix cond grad error when handle tensor array (#39689) * fix cond grad error when handle tensor array * add UT --- paddle/fluid/framework/var_type_inference.h | 6 ++++++ .../operators/controlflow/conditional_block_op.cc | 14 ++++++++++++-- .../tests/unittests/dygraph_to_static/test_list.py | 6 ++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index f649c9388f0f6..945b68438e1e7 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -69,6 +69,12 @@ class InferVarTypeContext { return op_->Inputs().at(name).size(); } + virtual size_t OutputSize(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL( + op_, platform::errors::PreconditionNotMet("op_ should not be null")); + return op_->Outputs().at(name).size(); + } + virtual const std::string& InputVarName(const std::string& name, const int index = 0) const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 31ed10a71201c..6bf419c47a566 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -272,8 +272,18 @@ class ConditionalBlockGradInferVarType : public framework::VarTypeInference { // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as // Input@GRAD. - ctx->SyncTypeAndDataType(ConditionalOp::kInputs, - framework::GradVarName(ConditionalOp::kInputs)); + auto input_size = ctx->InputSize(ConditionalOp::kInputs); + auto output_size = + ctx->OutputSize(framework::GradVarName(ConditionalOp::kInputs)); + PADDLE_ENFORCE_EQ(input_size, output_size, + platform::errors::InvalidArgument( + "input_size and output_size should be equal for " + "conditional_block_grad_op.")); + for (size_t i = 0; i < output_size; ++i) { + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs), + i); + } } }; diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 567f266cd57b1..ba1f5ed2b3ead 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -320,10 +320,12 @@ def forward(self, x, index): if index > 0: res = a[0] * a[0] + y = y + 1 else: res = a[-1] * a[-1] + y = y - 1 - z = a[-1] * res + z = a[-1] * res * y[0] return z @@ -333,7 +335,7 @@ def test_to_static(self): x = paddle.to_tensor([2, 3, 4], dtype='float32') index = paddle.to_tensor([1]) res = net(x, index) - self.assertEqual(res[0], 16.) + self.assertEqual(res[0], 48.) 
if __name__ == '__main__': From 740cfa94078ca667142c3601b6d4434b2c9b9ddf Mon Sep 17 00:00:00 2001 From: piotrekobiIntel Date: Mon, 21 Feb 2022 08:35:49 +0100 Subject: [PATCH 004/101] Add loss conversion from uint16 to float in ProgressBar class (#39231) * Add loss conversion from uint16 to float in progressbar class * Fix test coverage * Actually fix coverage * Fix format error --- python/paddle/hapi/progressbar.py | 15 +++++++++++++++ python/paddle/tests/test_progressbar.py | 1 + 2 files changed, 16 insertions(+) diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py index 6ed33f4f960b4..8020029be2a4e 100644 --- a/python/paddle/hapi/progressbar.py +++ b/python/paddle/hapi/progressbar.py @@ -20,6 +20,7 @@ import sys import time import numpy as np +import struct from collections import namedtuple __all__ = [] @@ -79,6 +80,20 @@ def start(self): def update(self, current_num, values={}): now = time.time() + def convert_uint16_to_float(in_list): + in_list = np.asarray(in_list) + out = np.vectorize( + lambda x: struct.unpack(' Date: Mon, 21 Feb 2022 18:54:48 +0800 Subject: [PATCH 005/101] [Pten] Migrate huber_loss into phi (#39761) * migrate huber_loss into phi * migrate infershape * modify pten into phi --- paddle/fluid/operators/huber_loss_op.cc | 48 ++----- paddle/fluid/operators/huber_loss_op.cu | 24 ---- paddle/fluid/operators/huber_loss_op.h | 123 ------------------ paddle/fluid/operators/huber_loss_op_npu.cc | 2 +- paddle/fluid/operators/huber_loss_op_xpu.cc | 3 +- paddle/phi/infermeta/binary.cc | 37 ++++++ paddle/phi/infermeta/binary.h | 7 + .../phi/kernels/cpu/huber_loss_grad_kernel.cc | 22 ++++ paddle/phi/kernels/cpu/huber_loss_kernel.cc | 21 +++ .../phi/kernels/gpu/huber_loss_grad_kernel.cu | 22 ++++ paddle/phi/kernels/gpu/huber_loss_kernel.cu | 21 +++ paddle/phi/kernels/huber_loss_grad_kernel.h | 30 +++++ paddle/phi/kernels/huber_loss_kernel.h | 30 +++++ .../impl/huber_loss_grad_kernel_impl.h | 75 +++++++++++ .../phi/kernels/impl/huber_loss_kernel_impl.h | 61 +++++++++ paddle/phi/ops/compat/huber_loss_sig.cc | 36 +++++ 16 files changed, 373 insertions(+), 189 deletions(-) delete mode 100644 paddle/fluid/operators/huber_loss_op.cu delete mode 100644 paddle/fluid/operators/huber_loss_op.h create mode 100644 paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/huber_loss_kernel.cc create mode 100644 paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/huber_loss_kernel.cu create mode 100644 paddle/phi/kernels/huber_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/huber_loss_kernel.h create mode 100644 paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/huber_loss_kernel_impl.h create mode 100644 paddle/phi/ops/compat/huber_loss_sig.cc diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 041f7487fd257..3915ce5809c39 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -12,47 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/huber_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { class HuberLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "HuberLoss"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "HuberLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), - platform::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - x_dims.size(), y_dims.size())); - - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(y_dims); - if (ctx->IsRuntime() || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - x_dims, y_dims, - platform::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - x_dims, y_dims)); - } - - auto out_dims = y_dims; - ctx->SetOutputDim("Residual", out_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -139,14 +112,11 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PT_INFER_META(phi::HuberLossInferMeta)); + REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, - ops::HuberLossGradOpMaker); + ops::HuberLossGradOpMaker, + HuberLossInferShapeFunctor); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - huber_loss, ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CPU_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu deleted file mode 100644 index 4ce6856a7eade..0000000000000 --- a/paddle/fluid/operators/huber_loss_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - huber_loss, - ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CUDA_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h deleted file mode 100644 index ebe26f05ab3e4..0000000000000 --- a/paddle/fluid/operators/huber_loss_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -struct HuberLossForward { - HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return static_cast(0.5) * val * val; - } else { - return delta * (abs_val - static_cast(0.5) * delta); - } - } - - T delta; -}; - -template -class HuberLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("Residual"); - auto* out1 = context.Output("Out"); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto x = EigenVector::Flatten(*in0); - auto y = EigenVector::Flatten(*in1); - out0->mutable_data(context.GetPlace()); - auto residual = EigenVector::Flatten(*out0); - residual.device(place) = y - x; - out1->mutable_data(context.GetPlace()); - auto loss = EigenVector::Flatten(*out1); - loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); - } -}; - -template -struct HuberLossBackward { - HOSTDEVICE HuberLossBackward(const T& delta, T sign) - : sign(sign), delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return sign * val; - } else { - if (val > 0) { - return sign * delta; - } else { - return -1 * sign * delta; - } - } - } - - T sign; - T delta; -}; - -template -class HuberLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Residual"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto residual = EigenVector::Flatten(*in0); - auto out_grad = EigenVector::Flatten(*in1); - - if (out0) { - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - x_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, -1.0)); - x_grad.device(place) = out_grad * x_grad; - } - - if (out1) { - out1->mutable_data(context.GetPlace()); - auto y_grad = EigenVector::Flatten(*out1); - y_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, 1.0)); - y_grad.device(place) = out_grad * y_grad; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index 
19ced131c00a2..6fc6960d3db56 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index 767ce542736e8..ccddec2779515 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f79b5982f6194..a964788b15e31 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -188,4 +189,40 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5e3214127ee23..93ef9f5f35abb 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -45,4 +45,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, const MetaTensor& y_meta, int axis, MetaTensor* out); + +void HuberLossInferMeta(const MetaTensor& input_meta, + const MetaTensor& label_meta, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc new file mode 100644 index 0000000000000..bd2349393e742 --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + huber_loss_grad, CPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc new file mode 100644 index 0000000000000..dfdab16bc85e3 --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PT_REGISTER_KERNEL( + huber_loss, CPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu new file mode 100644 index 0000000000000..5e1e000a38d95 --- /dev/null +++ b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PT_REGISTER_KERNEL( + huber_loss_grad, GPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/huber_loss_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_kernel.cu new file mode 100644 index 0000000000000..2cca0c08a3f3b --- /dev/null +++ b/paddle/phi/kernels/gpu/huber_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PT_REGISTER_KERNEL( + huber_loss, GPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/huber_loss_grad_kernel.h b/paddle/phi/kernels/huber_loss_grad_kernel.h new file mode 100644 index 0000000000000..c6246b1553197 --- /dev/null +++ b/paddle/phi/kernels/huber_loss_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void HuberLossGradKernel(const Context& dev_ctx, + const DenseTensor& residual, + const DenseTensor& out_grad, + float delta, + DenseTensor* input_grad, + DenseTensor* label_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/huber_loss_kernel.h b/paddle/phi/kernels/huber_loss_kernel.h new file mode 100644 index 0000000000000..3533a9ec6ded5 --- /dev/null +++ b/paddle/phi/kernels/huber_loss_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void HuberLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float delta, + DenseTensor* out, + DenseTensor* residual); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h new file mode 100644 index 0000000000000..b93578abba2b7 --- /dev/null +++ b/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" + +namespace phi { + +template +struct HuberLossBackward { + HOSTDEVICE HuberLossBackward(const T& delta, T sign) + : sign(sign), delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return sign * val; + } else { + if (val > 0) { + return sign * delta; + } else { + return -1 * sign * delta; + } + } + } + + T sign; + T delta; +}; + +template +void HuberLossGradKernel(const Context& dev_ctx, + const DenseTensor& residual, + const DenseTensor& out_grad, + float delta, + DenseTensor* input_grad, + DenseTensor* label_grad) { + T delta_ = static_cast(delta); + auto& place = *dev_ctx.eigen_device(); + + auto eigen_residual = EigenVector::Flatten(residual); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + + if (input_grad) { + dev_ctx.template Alloc(input_grad); + auto eigen_input_grad = EigenVector::Flatten(*input_grad); + eigen_input_grad.device(place) = + eigen_residual.unaryExpr(HuberLossBackward(delta_, -1.0)); + eigen_input_grad.device(place) = eigen_out_grad * eigen_input_grad; + } + + if (label_grad) { + dev_ctx.template Alloc(label_grad); + auto eigen_label_grad = EigenVector::Flatten(*label_grad); + eigen_label_grad.device(place) = + eigen_residual.unaryExpr(HuberLossBackward(delta_, 1.0)); + eigen_label_grad.device(place) = eigen_out_grad * eigen_label_grad; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/huber_loss_kernel_impl.h b/paddle/phi/kernels/impl/huber_loss_kernel_impl.h new file mode 100644 index 0000000000000..7fbdc80c3829b --- /dev/null +++ b/paddle/phi/kernels/impl/huber_loss_kernel_impl.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/huber_loss_kernel.h" + +namespace phi { + +template +struct HuberLossForward { + HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return static_cast(0.5) * val * val; + } else { + return delta * (abs_val - static_cast(0.5) * delta); + } + } + + T delta; +}; + +template +void HuberLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float delta, + DenseTensor* out, + DenseTensor* residual) { + T delta_ = static_cast(delta); + auto& place = *dev_ctx.eigen_device(); + + auto x = EigenVector::Flatten(input); + auto y = EigenVector::Flatten(label); + + dev_ctx.template Alloc(residual); + auto eigen_residual = EigenVector::Flatten(*residual); + eigen_residual.device(place) = y - x; + + dev_ctx.template Alloc(out); + auto loss = EigenVector::Flatten(*out); + loss.device(place) = eigen_residual.unaryExpr(HuberLossForward(delta_)); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/huber_loss_sig.cc b/paddle/phi/ops/compat/huber_loss_sig.cc new file mode 100644 index 0000000000000..6e7183ff9f281 --- /dev/null +++ b/paddle/phi/ops/compat/huber_loss_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
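For reference, the HuberLossForward and HuberLossBackward functors above implement the standard piecewise Huber loss on the residual r = label - input. A minimal scalar sketch in plain C++, independent of the phi tensor machinery (the helper names are made up for illustration):

#include <cassert>
#include <cmath>

// loss(r) = 0.5 * r * r                  if |r| <= delta
//         = delta * (|r| - 0.5 * delta)  otherwise
float huber_forward_ref(float r, float delta) {
  float a = std::fabs(r);
  return a <= delta ? 0.5f * r * r : delta * (a - 0.5f * delta);
}

// d loss / d r, i.e. HuberLossBackward with sign = +1 (the label side);
// the input side uses sign = -1 because r = label - input.
float huber_backward_ref(float r, float delta) {
  float a = std::fabs(r);
  return a <= delta ? r : (r > 0 ? delta : -delta);
}

int main() {
  assert(huber_forward_ref(0.5f, 1.0f) == 0.125f);  // quadratic branch
  assert(huber_forward_ref(3.0f, 1.0f) == 2.5f);    // linear branch
  assert(huber_backward_ref(3.0f, 1.0f) == 1.0f);   // clipped gradient
  return 0;
}

The grad kernel then multiplies this elementwise derivative by out_grad, with opposite signs for input_grad and label_grad, which matches the two HuberLossBackward instantiations in the implementation above.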
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HuberLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "huber_loss", {"X", "Y"}, {"delta"}, {"Out", "Residual"}); +} + +KernelSignature HuberLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("huber_loss_grad", + {"Residual", GradVarName("Out")}, + {"delta"}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PT_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); +PT_REGISTER_ARG_MAPPING_FN(huber_loss_grad, + phi::HuberLossGradOpArgumentMapping); From 9c51eee17a1b80768260a00b4416a3174e826b5c Mon Sep 17 00:00:00 2001 From: From00 Date: Mon, 21 Feb 2022 20:03:55 +0800 Subject: [PATCH 006/101] Move Abs InferShape to phi (#39762) * Move Abs InferShaper to phi * Fix CI error --- paddle/fluid/operators/abs_op.cc | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 149a87fe32da1..c28026a4bd43a 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,7 +16,10 @@ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -27,16 +30,6 @@ namespace operators { class AbsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "abs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "abs"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class AbsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -148,11 +141,15 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(abs, ops::AbsOp, ops::AbsOpMaker, ops::AbsGradMaker, - ops::AbsGradMaker); + ops::AbsGradMaker, + AbsInferShapeFunctor); REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, From 68631ed45874c79438a99a18b4415edd9f908dc4 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 21 Feb 2022 21:53:49 +0800 Subject: [PATCH 007/101] [PluggableDevice]custom kernel to phi core structs (#39690) * [PluggableDevice]custom kernel to pten core structs * mod extension.h for custom op * compatible python for CI * support custom context * refactor to pten * fix windows and ut --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/custom_kernel.cc | 345 +---- paddle/fluid/framework/custom_kernel.h | 13 +- paddle/fluid/platform/CMakeLists.txt | 4 + paddle/phi/api/all.h | 1 - paddle/phi/api/lib/CMakeLists.txt | 1 - paddle/phi/backends/CMakeLists.txt | 4 + paddle/phi/backends/all_context.h | 3 + paddle/phi/backends/custom/custom_context.cc | 6 +- paddle/phi/backends/custom/custom_context.h | 3 +- paddle/phi/common/backend.h | 26 + paddle/phi/core/CMakeLists.txt | 2 + paddle/phi/core/custom_kernel.cc | 66 + paddle/phi/core/custom_kernel.h | 49 + paddle/phi/core/dense_tensor.h | 3 + 
paddle/phi/core/kernel_context.h | 1 + paddle/phi/core/kernel_registry.h | 1141 ++++++++++------- paddle/phi/core/kernel_utils.h | 10 + paddle/phi/core/lod_utils.h | 6 + paddle/phi/core/tensor_meta.h | 6 + paddle/phi/core/tensor_utils.h | 24 +- paddle/phi/tests/common/test_backend.cc | 14 + paddle/phi/tests/core/CMakeLists.txt | 1 + .../tests/core/test_custom_kernel.cc} | 151 +-- .../tests/custom_kernel/custom_kernel_dot.cc | 19 +- .../custom_kernel/custom_kernel_dot_setup.py | 36 +- python/setup.py.in | 4 +- 27 files changed, 1001 insertions(+), 942 deletions(-) create mode 100644 paddle/phi/core/custom_kernel.cc create mode 100644 paddle/phi/core/custom_kernel.h rename paddle/{fluid/framework/custom_kernel_test.cc => phi/tests/core/test_custom_kernel.cc} (70%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 78f5bb077aaf1..7d527e24a0079 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -437,8 +437,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS - tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel pten_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -459,4 +458,3 @@ else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) -cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc index 3a00d9424646a..49a1e0774a6b1 100644 --- a/paddle/fluid/framework/custom_kernel.cc +++ b/paddle/fluid/framework/custom_kernel.cc @@ -18,355 +18,24 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/custom_kernel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/custom_kernel.h" namespace paddle { - namespace framework { -// set phi::Kernel args_def_ from op_kernel_info -// because we can not set directly to phi::Kernel without exposing -// phi::KernelArgsDef when parsing custom user function -static void ParseArgs(const OpKernelInfo& op_kernel_info, - phi::KernelArgsDef* args_def) { - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - for (auto& input : input_defs) { - auto type_index = - input.is_vector - ? 
std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendInput(input.backend, input.layout, input.dtype, type_index); - } - for (auto& output : output_defs) { - auto type_index = - output.is_vector - ? std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendOutput(output.backend, output.layout, output.dtype, - type_index); - } - for (auto& attr : attribute_defs) { - args_def->AppendAttribute(attr.type_index); - } -} - -// custom pten kernel call function define -static void RunKernelFunc(phi::KernelContext* ctx, - const OpKernelInfo& op_kernel_info) { - VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin..."; - - // input and output size is not params' num - // but actual Tensors' size - size_t input_size = ctx->InputsSize(); - size_t output_size = ctx->OutputsSize(); - size_t attr_size = ctx->AttrsSize(); - - // parameters' num of unified user kernel function - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - PADDLE_ENFORCE_GE(input_size, input_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx inputs size (%d) must be larger than " - "the size of kernel input_defs (%d).", - input_size, input_defs.size())); - - PADDLE_ENFORCE_GE(output_size, output_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx outputs size (%d) must be larger than " - "the size of kernel output_defs (%d).", - output_size, output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx attribute size (%d) must be equal to " - "to the size of kernel attribute_defs (%d).", - attr_size, attribute_defs.size())); - - VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size() - << "[tensor size:" << input_size << "]" - << " Attribute num: " << attribute_defs.size() - << " Output num: " << output_defs.size() - << "[tensor size:" << output_size << "]."; - - // Inputs mapping - std::vector custom_ins; - std::vector> custom_vec_ins; - for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) { - VLOG(3) << "Mapping Input[" << in_idx << "]"; - const std::pair range = ctx->InputRangeAt(in_idx); - - // is_vector tells if this Input is Tensor or std::vector - if (!input_defs.at(in_idx).is_vector) { - paddle::experimental::Tensor custom_t; - auto& ctx_tensor = ctx->InputAt(range.first); - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_ins.emplace_back(custom_t); - } else { - std::vector custom_vec_in; - auto ctx_tensor_vec = - ctx->MoveInputsBetween(range.first, range.second); - for (auto& ctx_tensor : ctx_tensor_vec) { - paddle::experimental::Tensor custom_t; - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_vec_in.emplace_back(custom_t); - } - custom_vec_ins.emplace_back(custom_vec_in); - } - VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // Attributes mapping - std::vector custom_attrs; - for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) { - VLOG(3) << "Mapping Attribute[" << attr_idx << "]"; - if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) { - bool arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int))) { - 
int arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(float))) { - float arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(double))) { - double arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int64_t))) { - int64_t arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(phi::dtype::float16))) { - phi::dtype::float16 arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(DataType))) { - DataType arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const Scalar&))) { - const Scalar& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const ScalarArray&))) { - const ScalarArray& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute attribute_defs[%d].type_index", attr_idx)); - } - VLOG(3) << "Mapped Attribute[" << attr_idx << "]"; - } - - // Outputs mapping - std::vector custom_outs; - std::vector> custom_vec_outs; - std::vector> custom_outs_ptr; - std::vector>> - custom_vec_outs_ptr; - - for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) { - VLOG(3) << "Mapping Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_outs.emplace_back(custom_t); - custom_outs_ptr.emplace_back(custom_t_ptr); - } else { - std::vector custom_vec_out; - std::vector> custom_vec_out_ptr; - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - for (auto ctx_tensor : ctx_tensor_vec) { - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_vec_out.emplace_back(custom_t); - custom_vec_out_ptr.emplace_back(custom_t_ptr); - } - custom_vec_outs.emplace_back(custom_vec_out); - custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // DeviceContext - // In pten, the first paramter XXContext is decided when registering - // through template param, but custom kernel function use unified - // DeviceContext as first parameter of user_kernel_fn, we use backend - // from OpKernelInfo to decide XXContext. 
In temporary simple - // DeviceContext, we just set necessary info to dev_ctx(such as stream - // in NPUContext), more related work should be done when - // phi::DeviceContext is exposed to outer. - DeviceContext dev_ctx; - auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info); - if (backend == phi::Backend::CPU) { - // do nothing - } else { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - size_t device_type_id_ = static_cast(backend) - - static_cast(phi::Backend::ALL_BACKEND); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); - if (!device_type.empty()) { - auto custom_ctx = - ctx->GetDeviceContext(); - dev_ctx.set_stream(custom_ctx.stream()); - return; - } -#endif - LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend - << " with compiled Paddle."; - return; - } - - auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info); - // call user function - user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs, - &custom_outs, &custom_vec_outs); - - VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function."; - - // NOTE: Map back the output tensors with stored shared_ptrs. - for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) { - VLOG(3) << "Mapping Back Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - *ctx_tensor = *(custom_outs_ptr.back().get()); - custom_outs_ptr.pop_back(); - } else { - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - auto custom_vec_ptr_out = custom_vec_outs_ptr.back(); - for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) { - *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get()); - custom_vec_ptr_out.pop_back(); - } - custom_vec_outs_ptr.pop_back(); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << "]."; - } - - // delete newed paddle::Tensor for outputs while calling user kernel function - for (size_t i = 0; i < custom_outs.size(); ++i) { - delete custom_outs[i]; - } - for (size_t i = 0; i < custom_vec_outs.size(); ++i) { - for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) { - delete custom_vec_outs[i][j]; - } - } -} - -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos) { - for (size_t i = 0; i < op_kernel_infos.size(); ++i) { - auto& kernel_info = op_kernel_infos[i]; - auto op_type = OpKernelInfoHelper::GetOpName(kernel_info); - auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info); - - VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key; - - // 1.Check whether this kernel is valid for a specific operator - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true, - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] %s is not ready for custom kernel registering.", - op_type)); - - // 2.Check whether kernel_key has been already registed - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().kernels()[op_type].find(kernel_key), - phi::KernelFactory::Instance().kernels()[op_type].end(), - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been " - "already existed in Paddle, please contribute PR if need " - "to optimize the kernel code. 
Custom kernel do NOT support " - "to replace existing kernel in Paddle.", - op_type, kernel_key)); - - // phi::KernelFn - phi::KernelFn kernel_fn = [kernel_info](phi::KernelContext* ctx) { - VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda."; - RunKernelFunc(ctx, kernel_info); - }; - // variadic_kernel_fn - void* variadic_kernel_fn = - OpKernelInfoHelper::GetVariadicKernelFn(kernel_info); - phi::Kernel kernel(kernel_fn, variadic_kernel_fn); - // args info - ParseArgs(kernel_info, kernel.mutable_args_def()); - // register custom kernel to phi::KernelFactory - phi::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel; - VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type - << ">'s kernel " << kernel_key << " to Paddle. " - << "It will be used like native ones."; - } -} - -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map) { - auto& kernel_info_map = op_kernel_info_map.GetMap(); - VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: " - << kernel_info_map.size(); - - // pair: {op_type, OpKernelInfo} - for (auto& pair : kernel_info_map) { - VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first; - RegisterKernelWithMetaInfo(pair.second); - } -} - void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { #ifdef _LINUX - typedef OpKernelInfoMap& get_op_kernel_info_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetOpKernelInfoMap")); + typedef phi::CustomKernelMap& get_custom_kernel_map_t(); + auto* func = reinterpret_cast( + dlsym(dso_handle, "PD_GetCustomKernelMap")); if (func == nullptr) { LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetOpKernelInfoMap symbol in this lib."; + << "PD_GetCustomKernelMap symbol in this lib."; return; } - auto& op_kernel_info_map = func(); - RegisterKernelWithMetaInfoMap(op_kernel_info_map); + auto& custom_kernel_map = func(); + phi::RegisterCustomKernels(custom_kernel_map); LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; #else VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h index 30bccc97000f8..31084a34413ea 100644 --- a/paddle/fluid/framework/custom_kernel.h +++ b/paddle/fluid/framework/custom_kernel.h @@ -14,22 +14,13 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/op_kernel_info.h" +#include namespace paddle { namespace framework { +// Load custom kernel lib and register void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); -// Load custom kernel api: register kernel after user compiled -void LoadOpKernelInfoAndRegister(const std::string& dso_name); - -// Register custom kernel api: register kernel directly -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map); - -// Interface for selective register custom kernel. 
-void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index be02bac1aa0ef..b808e1561b24a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -146,6 +146,10 @@ if(WITH_ASCEND_CL) target_link_libraries(device_context npu_resource_pool) endif() +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) # Manage all device event library diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 8d840214092ba..06f3cd8447606 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -41,7 +41,6 @@ limitations under the License. */ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_kernel_info.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 175bf34c0da66..720c6f54bb075 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -90,7 +90,6 @@ cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispat cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) -cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor_raw) cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 441bd0a8c303b..38366d57841b0 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -21,3 +21,7 @@ endif() if(WITH_GPU) add_dependencies(pten_context gpu_context) endif() + +if(WITH_CUSTOM_DEVICE) + add_dependencies(pten_context custom_context) +endif() diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index b53c5ce5c780c..3fe03905e42dd 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -21,12 +21,15 @@ limitations under the License. */ // path replacement after implementing pten DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. 
#include "paddle/fluid/platform/device_context.h" namespace phi { using DeviceContextPool = paddle::platform::DeviceContextPool; } // namespace phi +#endif diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index 445f550839160..bde3b6a08539b 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -32,8 +32,8 @@ struct CustomContext::Impl { const Place& GetPlace() const { return place_; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); + void* stream() const { + return reinterpret_cast(stream_->raw_stream()); } void Wait() const { stream_->Wait(); } @@ -47,7 +47,7 @@ void CustomContext::Init() { impl_->Init(); } const Place& CustomContext::GetPlace() const { return impl_->GetPlace(); } -C_Stream CustomContext::stream() const { return impl_->stream(); } +void* CustomContext::stream() const { return impl_->stream(); } void CustomContext::Wait() const { return impl_->Wait(); } diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 109f5e53707f6..37b0ee21219b5 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/device_ext.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -30,7 +29,7 @@ class CustomContext : public DeviceContext { const Place& GetPlace() const override; /*! \brief Return stream in the device context. */ - C_Stream stream() const; + void* stream() const; // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index f7c39eacae9bd..62692fb9475da 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -130,6 +130,32 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { return os; } +inline Backend StringToBackend(const char* backend_cstr) { + std::string s(backend_cstr); + if (s == std::string("Undefined")) { + return Backend::UNDEFINED; + } + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + if (s == std::string("CPU")) { + return Backend::CPU; + } else if (s == std::string("GPU")) { + return Backend::GPU; + } else if (s == std::string("XPU")) { + return Backend::XPU; + } else if (s == std::string("NPU")) { + return Backend::NPU; + } else if (s == std::string("MKLDNN")) { + return Backend::MKLDNN; + } else if (s == std::string("CUDNN")) { + return Backend::CUDNN; + } else { + return static_cast(static_cast(Backend::NUM_BACKENDS) + + phi::GetOrRegisterGlobalDeviceTypeId(s)); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 18f209377bafc..32b9b42f74f62 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -25,6 +25,8 @@ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_te cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim) +cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) + # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) add_dependencies(dense_tensor mkldnn) diff --git a/paddle/phi/core/custom_kernel.cc 
b/paddle/phi/core/custom_kernel.cc new file mode 100644 index 0000000000000..75ff9cc286003 --- /dev/null +++ b/paddle/phi/core/custom_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/custom_kernel.h" + +namespace phi { + +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { + auto& kernel_info_map = custom_kernel_map.GetMap(); + VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + + for (auto& pair : kernel_info_map) { + PADDLE_ENFORCE_EQ( + KernelFactory::Instance().HasCompatiblePtenKernel(pair.first), + true, + phi::errors::InvalidArgument( + "The kernel %s is not ready for custom kernel registering.", + pair.first)); + + for (auto& info_pair : pair.second) { + auto& kernels = KernelFactory::Instance().kernels(); + PADDLE_ENFORCE_EQ( + kernels[pair.first].find(info_pair.first), + kernels[pair.first].end(), + phi::errors::InvalidArgument( + "The operator <%s>'s kernel: %s has been already existed " + "in Paddle, please contribute PR if it is necessary " + "to optimize the kernel code. Custom kernel does NOT support " + "to replace existing kernel in Paddle.", + pair.first, + info_pair.first)); + + kernels[pair.first][info_pair.first] = info_pair.second; + + VLOG(3) << "Successed in registering operator <" << pair.first + << ">'s kernel: " << info_pair.first + << " to Paddle. It will be used like native ones."; + } + } +} + +} // namespace phi + +#ifdef __cplusplus +extern "C" { +#endif + +// C-API to get global CustomKernelMap. +phi::CustomKernelMap& PD_GetCustomKernelMap() { + return phi::CustomKernelMap::Instance(); +} + +#ifdef __cplusplus +} // end extern "C" +#endif diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h new file mode 100644 index 0000000000000..20ae2b7bb7360 --- /dev/null +++ b/paddle/phi/core/custom_kernel.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/macros.h" + +namespace phi { +/** + * Note: + * Used to store kernels' info before registered to KernelFactory. 
+ */ +class CustomKernelMap { + public: + static CustomKernelMap& Instance() { + static CustomKernelMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + KernelNameMap& Kernels() { return kernels_; } + + const KernelNameMap& GetMap() const { return kernels_; } + + private: + CustomKernelMap() = default; + DISABLE_COPY_AND_ASSIGN(CustomKernelMap); + + KernelNameMap kernels_; +}; + +/** + * Note: + * Used to register custom kernels to KernelFactory. + */ +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); + +} // namespace phi diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 622cedf1d7f91..0dddd63099bbc 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -171,6 +171,9 @@ class DenseTensor : public TensorBase, DenseTensorMeta meta_; std::shared_ptr holder_; +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" +#endif }; + } // namespace phi diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 0b960004fcb27..57e2db60c24ca 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -22,6 +22,7 @@ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/utils/any.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace phi { diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 577e9e28cf379..a93c9a2826068 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -21,6 +21,7 @@ #include #include +#include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" @@ -62,6 +63,9 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_XPU) || arg_type == std::type_index(typeid(const XPUContext&))) { +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + || + arg_type == std::type_index(typeid(const CustomContext&))) { #else ) { #endif @@ -83,11 +87,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -99,11 +105,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe @@ -121,20 +129,28 @@ struct KernelArgsParseFunctor { } }; +// NOTE: used for making a difference between kernels compiled with phi or not. 
+enum class RegType : uint8_t { + BUILTIN = 0, // compiled with phi + PLUGIN, // separate compiled and registered +}; + // TODO(chenweihang): Polish the kernel selection logic, support the selection // of ALL_DTYPE kernel, and simplify the constructor struct KernelRegistrar { public: - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, KernelFn kernel_fn, void* variadic_kernel_fn) { - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, dtype, args_parse_fn, @@ -143,8 +159,9 @@ struct KernelRegistrar { variadic_kernel_fn); } - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, @@ -160,8 +177,9 @@ struct KernelRegistrar { dtype == static_cast(DataType::UINT16)) { continue; } - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, static_cast(dtype), args_parse_fn, @@ -172,8 +190,9 @@ struct KernelRegistrar { } private: - void ConstructKernel(const char* kernel_name_cstr, - Backend backend, + void ConstructKernel(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, @@ -181,11 +200,16 @@ struct KernelRegistrar { KernelFn kernel_fn, void* variadic_kernel_fn) { std::string kernel_name(kernel_name_cstr); - KernelKey kernel_key(backend, layout, dtype); + KernelKey kernel_key( + paddle::experimental::StringToBackend(backend_cstr), layout, dtype); Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(kernel_key, &kernel); - KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + if (reg_type == RegType::BUILTIN) { + KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + } else { + CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; + } } }; @@ -220,21 +244,38 @@ struct KernelRegistrar { * Note: `2TA` means `2 template argument` */ #define PT_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, __VA_ARGS__)) + _PT_REGISTER_KERNEL(::phi::RegType::BUILTIN, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +#define _PT_REGISTER_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PT_REGISTER_KERNEL must be called in global namespace."); \ + PT_EXPAND(_PT_REGISTER_2TA_KERNEL(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) #ifndef _WIN32 #define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) 
\ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -255,12 +296,14 @@ struct KernelRegistrar { * And msvc can work without template instantiation */ #define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -269,82 +312,119 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ - _PT_KERNEL_INSTANTIATION( \ - PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, ...) \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ + _PT_KERNEL_INSTANTIATION( \ + PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) + +#define PT_KERNEL_REGISTRAR_INIT(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + ...) \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ + reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) // clang-format off @@ -352,15 +432,19 @@ struct KernelRegistrar { /* The =pre-commit always treats this macro into the wrong format, and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ args_def_fn, \ meta_kernel_fn, \ ...) 
\ PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ PT_ID, \ args_def_fn, \ @@ -369,413 +453,492 @@ struct KernelRegistrar { // clang-format on -#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ +#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) - /** PT_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. */ -#define PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define PT_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + _PT_REGISTER_GENERAL_KERNEL( \ + ::phi::RegType::BUILTIN, kernel_name, backend, layout, kernel_fn, dtype) + +#define _PT_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - _PT_REGISTER_GENERAL_KERNEL(kernel_name, backend, layout, kernel_fn, dtype) + __PT_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) #ifndef _WIN32 -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PT_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -787,14 +950,15 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PT_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -821,4 +985,33 @@ struct KernelRegistrar { __declare_kernel_symbol_for_##kernel_name##_##backend##_##layout = \ TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() +/** PD_REGISTER_KERNEL + * + * Used to register kernels for built-in backends. + * Support CPU GPU XPU. + */ +#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PT_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +/** PD_REGISTER_CUSTOM_KERNEL + * + * Used to register kernels for plug-in backends. 
+ * Support user-defined backend such as 'Ascend910'. + */ +#define PD_REGISTER_CUSTOM_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PT_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + kernel_name, \ + backend, \ + ::phi::CustomContext, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + } // namespace phi diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 8c7d096eab091..862f61b20400e 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/scalar.h" @@ -22,7 +23,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" +#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -210,13 +213,18 @@ struct KernelImpl { #ifdef PADDLE_WITH_XPU PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); +#endif /* Input Helpers */ PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -250,7 +258,9 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/lod_utils.h b/paddle/phi/core/lod_utils.h index 2b0be4d93429d..a5f73b66fb99b 100644 --- a/paddle/phi/core/lod_utils.h +++ b/paddle/phi/core/lod_utils.h @@ -15,10 +15,16 @@ #pragma once // See Note [ Why still include the fluid headers? ] +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { +#ifndef PADDLE_WITH_CUSTOM_KERNEL using LoD = std::vector>; +#else +using LoD = std::vector>; +#endif void AppendLoD(LoD* lod, const LoD& lod_length); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index d5e5e2aa001fd..ede9b43b1f382 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -24,12 +24,18 @@ limitations under the License. */ // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? // @zhanlve: Rollback to original LoD for now +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { using DDim = phi::DDim; +#ifndef PADDLE_WITH_CUSTOM_KERNEL using LoD = std::vector>; +#else +using LoD = std::vector>; +#endif /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. 
/// diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 04db7c0877ad8..676a590ecbce2 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -31,25 +31,25 @@ class DenseTensorUtils { size_t bytes = tensor.numel() * SizeOf(tensor.dtype()); PADDLE_ENFORCE_GE(tensor.capacity(), bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The memory size %d should be enough to meet the " "volume required by metadata %d.", tensor.capacity(), bytes)); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - tensor.dims()[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + phi::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + tensor.dims()[0], + phi::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index 941c00d9fea8b..d74a35c9eae2e 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -52,5 +52,19 @@ TEST(Backend, OStream) { } } +TEST(Backend, StringToBackend) { + namespace pexp = paddle::experimental; + EXPECT_EQ(phi::Backend::UNDEFINED, pexp::StringToBackend("Undefined")); + EXPECT_EQ(phi::Backend::CPU, pexp::StringToBackend("CPU")); + EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("GPU")); + EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); + EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); + EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); + EXPECT_EQ(phi::Backend::CUDNN, pexp::StringToBackend("CUDNN")); + EXPECT_EQ(static_cast( + static_cast(phi::Backend::NUM_BACKENDS) + 1), + pexp::StringToBackend("CustomBackend")); +} + } // namespace tests } // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 971d9112eead9..576ab7ffe6a66 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,3 +1,4 @@ +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS pten_custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/fluid/framework/custom_kernel_test.cc b/paddle/phi/tests/core/test_custom_kernel.cc similarity index 70% rename from paddle/fluid/framework/custom_kernel_test.cc rename to paddle/phi/tests/core/test_custom_kernel.cc index fb3cc0a35f0e0..b0957d80aa95e 100644 --- a/paddle/fluid/framework/custom_kernel_test.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -17,24 +17,21 @@ limitations under the License. 
*/ #define _LINUX #endif -#include "paddle/fluid/framework/custom_kernel.h" - -#include -#include -#include "paddle/extension.h" +#ifdef _LINUX #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/utils/small_vector.h" -#ifdef _LINUX +#include + // user kernel function namespace custom_kernel { @@ -43,17 +40,23 @@ namespace custom_kernel { // attribute 11: fake_attributes // output 2: one Tensor* and one std::vector template -void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, - const paddle::Tensor& y, - const std::vector& fake_input_vec, - bool fake_attr_bool, int fake_attr_int, float fake_attr_float, - double fake_attr_double, int64_t fake_attr_int64, - phi::dtype::float16 fake_attr_f16, phi::DataType fake_attr_dtype, +void FakeDot(const Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + const std::vector& fake_input_vec, + bool fake_attr_bool, + int fake_attr_int, + float fake_attr_float, + double fake_attr_double, + int64_t fake_attr_int64, + phi::dtype::float16 fake_attr_f16, + phi::DataType fake_attr_dtype, const phi::Scalar& fake_attr_scalar, const phi::ScalarArray& fake_attr_scalar_array, const std::vector& fake_attr_int64_vec, - const std::vector& fake_attr_int_vec, paddle::Tensor* out, - std::vector fake_out_vec) { + const std::vector& fake_attr_int_vec, + phi::DenseTensor* out, + std::vector fake_out_vec) { // print param info std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl; std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl; @@ -83,10 +86,10 @@ void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(paddle::PlaceType::kCPU); - auto shape = x.shape(); + T* z = dev_ctx.template Alloc(out); + auto&& d = x.dims(); auto const N = x.numel(); - auto const B = shape[shape.size() - 1]; + auto const B = d[d.size() - 1]; for (int j = 0; j < N / B; j++) { T ss = 0; for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); @@ -95,8 +98,19 @@ void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, } } // namespace custom_kernel -PD_REGISTER_KERNEL(fake_dot, CPU, ALL_LAYOUT, custom_kernel::FakeDot, float, - double, int, int64_t, int8_t, uint8_t) {} +PD_REGISTER_KERNEL(fake_dot, + CPU, + ALL_LAYOUT, + custom_kernel::FakeDot, + float, + double, + int, + int64_t, + int8_t, + uint8_t) {} + +namespace phi { +namespace tests { // Upper code will store dot kernels info into OpKernelInfoMap TEST(CustomKernel, custom_kernel_dot) { @@ -105,33 +119,38 @@ TEST(CustomKernel, custom_kernel_dot) { phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT; // 1.custom kernel info parsed and store - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find(op_name) != - paddle::OpKernelInfoMap::Instance().GetMap().end()); + 
EXPECT_TRUE(phi::CustomKernelMap::Instance().GetMap().find(op_name) != + phi::CustomKernelMap::Instance().GetMap().end()); + auto& custom_kernels = phi::CustomKernelMap::Instance().Kernels(); // 2.info check - EXPECT_EQ( - 6, static_cast(paddle::OpKernelInfoMap::Instance()[op_name].size())); - // index 0 - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetBackend() == - backend); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetDataLayout() == - layout); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetDataType() == - phi::DataType::FLOAT32); - // index 5 - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetBackend() == - backend); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetDataLayout() == - layout); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetDataType() == - phi::DataType::UINT8); + EXPECT_EQ(6, static_cast(custom_kernels[op_name].size())); + auto& custom_fake_dot_kernels = custom_kernels[op_name]; + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::FLOAT64)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT32)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT64)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT8)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::UINT8)) != + custom_fake_dot_kernels.end()); // 3.before register auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePtenKernel(op_name)); - // mock fake_dot is supported by pten for HasCompatiblePtenKernel check while + // mock fake_dot is supported by phi for HasCompatiblePtenKernel check while // registering auto& fake_dot_kernels = kernels[op_name]; @@ -155,8 +174,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // register - paddle::framework::RegisterKernelWithMetaInfoMap( - paddle::OpKernelInfoMap::Instance()); + phi::RegisterCustomKernels(phi::CustomKernelMap::Instance()); EXPECT_TRUE(fake_dot_kernels.find( phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != @@ -186,15 +204,15 @@ TEST(CustomKernel, custom_kernel_dot) { paddle::platform::CPUPlace()); auto dense_x = std::make_shared( alloc.get(), - phi::DenseTensorMeta(phi::DataType::UINT8, phi::make_ddim({2, 3}), - phi::DataLayout::NCHW)); + phi::DenseTensorMeta( + phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); auto dense_y = std::make_shared( alloc.get(), - phi::DenseTensorMeta(phi::DataType::UINT8, phi::make_ddim({2, 3}), - phi::DataLayout::NCHW)); + phi::DenseTensorMeta( + phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -288,38 +306,7 @@ TEST(CustomKernel, custom_kernel_dot) { ASSERT_EQ(expect_result[1], actual_result1); } -// test OpKernelInfoHelper -TEST(OpKernelInfoHelper, op_kernel_info_help_getters) { - using 
OpKernelInfoHelper = paddle::framework::OpKernelInfoHelper; - std::string op_name = "fake_dot"; - phi::Backend backend = phi::Backend::CPU; - phi::DataLayout layout = phi::DataLayout::ANY; - phi::DataType dtype = phi::DataType::FLOAT32; - - auto op_kernel_info = paddle::OpKernelInfoMap::Instance()[op_name][0]; - - EXPECT_EQ(op_name, OpKernelInfoHelper::GetOpName(op_kernel_info)); - EXPECT_EQ(backend, OpKernelInfoHelper::GetBackend(op_kernel_info)); - EXPECT_EQ(layout, OpKernelInfoHelper::GetDataLayout(op_kernel_info)); - EXPECT_EQ(dtype, OpKernelInfoHelper::GetDataType(op_kernel_info)); - - EXPECT_EQ(phi::KernelKey(backend, layout, dtype), - OpKernelInfoHelper::GetKernelKey(op_kernel_info)); - - paddle::CustomKernelFunc kernel_fn = - PD_PT_KERNEL(custom_kernel::FakeDot); - EXPECT_EQ(kernel_fn, OpKernelInfoHelper::GetKernelFn(op_kernel_info)); - - void* variadic_func = - PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot); - EXPECT_EQ(variadic_func, - OpKernelInfoHelper::GetVariadicKernelFn(op_kernel_info)); - - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - EXPECT_EQ(3, static_cast(input_defs.size())); - EXPECT_EQ(2, static_cast(output_defs.size())); - EXPECT_EQ(11, static_cast(attribute_defs.size())); -} +} // namespace tests +} // namespace phi + #endif diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc index 3ae30c2f30577..68393cba57e36 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/extension.h" +#include "paddle/phi/core/kernel_registry.h" namespace paddle { @@ -21,19 +21,19 @@ namespace custom_kernel { // Here we use dot for test // This test will fail when this kernel is supported in framework template -void Dot(const Context& dev_ctx, - const paddle::Tensor& x, - const paddle::Tensor& y, - paddle::Tensor* out) { +void DotKernel(const Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + phi::DenseTensor* out) { auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(paddle::PlaceType::kCPU); + T* z = dev_ctx.template Alloc(out); // Loop over the total N elements of both operands while sum-reducing every // B pairs along the way where B is the dimension of the least ordered axis - auto shape = x.shape(); + auto&& d = x.dims(); auto const N = x.numel(); - auto const B = shape[shape.size() - 1]; + auto const B = d[d.size() - 1]; for (int j = 0; j < N / B; j++) { T ss = 0; @@ -45,6 +45,7 @@ void Dot(const Context& dev_ctx, } // namespace custom_kernel } // namespace paddle -PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, paddle::custom_kernel::Dot, int8_t) { +PD_REGISTER_KERNEL( + dot, CPU, ALL_LAYOUT, paddle::custom_kernel::DotKernel, int8_t) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8); } diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 5e3bd2f8ed98d..3cef228d14d6e 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,9 +16,28 @@ from paddle.fluid import core from distutils.sysconfig import get_python_lib from distutils.core import setup, Extension +from setuptools.command.build_ext import build_ext + + +# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes +# Avoid a gcc warning below: +# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid +# for C/ObjC but not for C++ +class BuildExt(build_ext): + def build_extensions(self): + if '-Wstrict-prototypes' in self.compiler.compiler_so: + self.compiler.compiler_so.remove('-Wstrict-prototypes') + super(BuildExt, self).build_extensions() + # cc flags -paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC'] +paddle_extra_compile_args = [ + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-parentheses', + '-DPADDLE_WITH_CUSTOM_KERNEL', +] if core.is_compiled_with_npu(): paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] @@ -27,6 +46,14 @@ paddle_custom_kernel_include = [ os.path.join(site_packages_path, 'paddle', 'include'), ] +# include path third_party +compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], + 'build/third_party') +paddle_custom_kernel_include += [ + os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost + os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags + os.path.join(compile_third_party_path, 'install/glog/include'), # glog +] # libs path paddle_custom_kernel_library_dir = [ @@ -50,4 +77,5 @@ name='custom_kernel_dot', version='1.0', description='custom kernel fot compiling', + cmdclass={'build_ext': BuildExt}, ext_modules=[custom_kernel_dot_module]) diff --git a/python/setup.py.in b/python/setup.py.in index 7b3909d40a01b..f39429387dbc3 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -577,9 +577,9 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) + # pten common headers # pten level api headers (low level api) list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # pten core headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # pten backends headers # utila api headers - ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'] + + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) + # paddle utils headers ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) if '${WITH_MKLDNN}' == 'ON': From 73bf9673815ccfa8e28d75b366eaa75e275b5c1d Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 22 Feb 2022 09:23:17 +0800 Subject: [PATCH 008/101] [fleet exe] supprot fp16 feed and fetch on cpp side (#39758) --- .../distributed/fleet_executor/dist_model.cc | 10 ++- .../dist_model_tensor_wrapper.h | 6 ++ paddle/fluid/pybind/bind_fleet_executor.cc | 89 ++++++++++++++----- .../test_fleet_exe_dist_model_tensor.py | 13 +++ 4 files changed, 94 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e684d75bfb832..c1408130b5e57 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_tensor_ptr = input_tensor->mutable_data(dims, place); } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == 
DistModelDataType::FLOAT16) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; @@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() { feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else if (real_var->GetDataType() == framework::proto::VarType::FP16) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16}); } else { LOG(ERROR) << "Don't support feed var dtype for: " << real_var->GetDataType(); @@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector *output_data, } else if (type == framework::proto::VarType::INT32) { rst = FetchResult(fetch, output); output->dtype = DistModelDataType::INT32; + } else if (type == framework::proto::VarType::FP16) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT16; } else { LOG(ERROR) << "DistModel meets unknown fetch data type. DistModel only " - "supports float32, int64 and int32 fetch type for now."; + "supports float32, float16, int64 and int32 fetch type " + "for now."; } if (!rst) { LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 6bdd858d6cf9e..dc8b2596803e0 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype() { return DistModelDataType::FLOAT32; } +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::FLOAT16; +} + class DistModelDataBuf { public: explicit DistModelDataBuf(size_t length) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 7bb7f03983eb9..b29cc10e8f56f 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -24,10 +24,41 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" +#include "pybind11/pybind11.h" namespace py = pybind11; +namespace pybind11 { +namespace detail { + +// Note: use same enum number of float16 in numpy. +// import numpy as np +// print np.dtype(np.float16).num # 23 +constexpr int NPY_FLOAT16_ = 23; + +// Note: Since float16 is not a builtin type in C++, we register +// paddle::platform::float16 as numpy.float16. +// Ref: https://github.com/pybind/pybind11/issues/1776 +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "e" represents float16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. 
+ return "e"; + } + static constexpr auto name = _("float16"); +}; + +} // namespace detail +} // namespace pybind11 + namespace paddle { namespace pybind { @@ -175,6 +206,7 @@ void BindFleetExecutor(py::module* m) { .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) .def("reset", [](DistModelDataBuf& self, std::vector& data) { self.Resize(data.size() * sizeof(float)); @@ -183,29 +215,35 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", - [](DistModelDataBuf& self, const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64 and " - "FLOAT32.")); - } - return l; - }); + .def("tolist", [](DistModelDataBuf& self, + const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. 
Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) @@ -221,6 +259,10 @@ void BindFleetExecutor(py::module* m) { py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), + py::arg("data"), py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) .def_readwrite("name", &DistModelTensor::name) .def_readwrite("shape", &DistModelTensor::shape) .def_readwrite("data", &DistModelTensor::data) @@ -231,7 +273,8 @@ void BindFleetExecutor(py::module* m) { py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32); + .value("INT32", DistModelDataType::INT32) + .value("FLOAT16", DistModelDataType::FLOAT16); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py index a74b4f0d224ef..2d4fe92f05156 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py @@ -58,6 +58,19 @@ def test_dist_model_tensor(self): self.assertEqual(dist_tensor_float.as_ndarray().ravel().tolist(), tensor_float.ravel().tolist()) + tensor_float_16 = np.random.randn(20, 2).astype('float16') + dist_tensor_float_16 = DistModelTensor(tensor_float_16, + 'float_tensor_16') + self.assertEqual(dist_tensor_float_16.dtype, DistModelDataType.FLOAT16) + self.assertEqual( + dist_tensor_float_16.data.tolist('float16'), + tensor_float_16.ravel().tolist()) + self.assertEqual(dist_tensor_float_16.data.length(), 40 * 2) + self.assertEqual(dist_tensor_float_16.name, 'float_tensor_16') + dist_tensor_float_16.data.reset(tensor_float_16) + self.assertEqual(dist_tensor_float_16.as_ndarray().ravel().tolist(), + tensor_float_16.ravel().tolist()) + if __name__ == '__main__': unittest.main() From 4a3387963583eccad4fa254aa414bc27486fc025 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 22 Feb 2022 09:23:34 +0800 Subject: [PATCH 009/101] [PTen->Phi PR2] Rename PT_REGISTER macro to PD_REGISTER (#39790) * unify register macro * rename declare macro * fix infrt error --- cmake/pten.cmake | 22 +++--- .../fluid/framework/infershape_utils_test.cc | 2 +- paddle/phi/api/ext/op_kernel_info.h | 17 ++--- paddle/phi/api/lib/api_declare.h | 6 +- paddle/phi/api/lib/api_registry.h | 4 +- paddle/phi/api/lib/manual_api.cc | 10 +-- paddle/phi/api/lib/op_kernel_info.cc | 2 +- paddle/phi/api/lib/sparse_api.cc | 26 +++---- paddle/phi/common/backend.h | 14 ++-- paddle/phi/core/compat/op_utils.h | 24 +++---- paddle/phi/core/infermeta_utils.h | 6 +- paddle/phi/core/kernel_registry.h | 67 ++++++++++--------- paddle/phi/infermeta/unary.cc | 4 +- paddle/phi/kernels/cpu/abs_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/abs_kernel.cc | 2 +- paddle/phi/kernels/cpu/bernoulli_kernel.cc | 2 +- paddle/phi/kernels/cpu/cast_kernel.cc | 2 +- paddle/phi/kernels/cpu/complex_kernel.cc | 2 +- paddle/phi/kernels/cpu/concat_kernel.cc | 2 +- paddle/phi/kernels/cpu/copy_kernel.cc | 2 +- .../phi/kernels/cpu/diagonal_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/diagonal_kernel.cc | 2 +- paddle/phi/kernels/cpu/digamma_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/digamma_kernel.cc | 2 +- paddle/phi/kernels/cpu/dot_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/dot_kernel.cc | 2 
+- .../kernels/cpu/elementwise_grad_kernel.cc | 10 +-- paddle/phi/kernels/cpu/expand_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/expand_kernel.cc | 2 +- paddle/phi/kernels/cpu/full_kernel.cc | 4 +- paddle/phi/kernels/cpu/histogram_kernel.cc | 2 +- .../phi/kernels/cpu/huber_loss_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/huber_loss_kernel.cc | 2 +- paddle/phi/kernels/cpu/lerp_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/lerp_kernel.cc | 2 +- .../kernels/cpu/masked_select_grad_kernel.cc | 2 +- .../phi/kernels/cpu/masked_select_kernel.cc | 2 +- paddle/phi/kernels/cpu/math_kernel.cc | 12 ++-- paddle/phi/kernels/cpu/matmul_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/matmul_kernel.cc | 2 +- paddle/phi/kernels/cpu/norm_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/norm_kernel.cc | 2 +- paddle/phi/kernels/cpu/scale_kernel.cc | 2 +- paddle/phi/kernels/cpu/sign_kernel.cc | 2 +- paddle/phi/kernels/cpu/split_kernel.cc | 2 +- paddle/phi/kernels/cpu/trace_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/trace_kernel.cc | 2 +- paddle/phi/kernels/cpu/trunc_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/trunc_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 8 +-- paddle/phi/kernels/flatten_grad_kernel.cc | 6 +- paddle/phi/kernels/flatten_kernel.cc | 12 ++-- paddle/phi/kernels/gpu/abs_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/abs_kernel.cu | 2 +- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 2 +- paddle/phi/kernels/gpu/cast_kernel.cu | 2 +- paddle/phi/kernels/gpu/complex_kernel.cu | 2 +- paddle/phi/kernels/gpu/concat_kernel.cu | 2 +- paddle/phi/kernels/gpu/copy_kernel.cu | 2 +- .../phi/kernels/gpu/diagonal_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/diagonal_kernel.cu | 2 +- paddle/phi/kernels/gpu/digamma_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/digamma_kernel.cu | 2 +- paddle/phi/kernels/gpu/dot_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/dot_kernel.cu | 2 +- .../kernels/gpu/elementwise_grad_kernel.cu | 10 +-- paddle/phi/kernels/gpu/expand_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/expand_kernel.cu | 2 +- paddle/phi/kernels/gpu/full_kernel.cu | 4 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 2 +- .../phi/kernels/gpu/huber_loss_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/huber_loss_kernel.cu | 2 +- paddle/phi/kernels/gpu/lerp_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/lerp_kernel.cu | 2 +- .../kernels/gpu/masked_select_grad_kernel.cu | 2 +- .../phi/kernels/gpu/masked_select_kernel.cu | 2 +- paddle/phi/kernels/gpu/math_kernel.cu | 12 ++-- paddle/phi/kernels/gpu/matmul_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/matmul_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/scale_kernel.cu | 2 +- paddle/phi/kernels/gpu/sign_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/split_kernel.cu | 2 +- paddle/phi/kernels/gpu/trace_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/trace_kernel.cu | 2 +- paddle/phi/kernels/gpu/trunc_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/trunc_kernel.cu | 2 +- paddle/phi/kernels/math_kernel.cc | 24 +++---- paddle/phi/kernels/reshape_grad_kernel.cc | 12 ++-- paddle/phi/kernels/reshape_kernel.cc | 12 ++-- .../phi/kernels/selected_rows/full_kernel.cc | 4 +- .../phi/kernels/selected_rows/scale_kernel.cc | 4 +- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 12 ++-- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 12 ++-- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/kernels/xpu/cast_kernel.cc | 2 +- paddle/phi/kernels/xpu/copy_kernel.cc | 2 +- paddle/phi/kernels/xpu/full_kernel.cc | 
4 +- paddle/phi/kernels/xpu/scale_kernel.cc | 2 +- paddle/phi/ops/compat/abs_sig.cc | 6 +- paddle/phi/ops/compat/cast_sig.cc | 2 +- paddle/phi/ops/compat/concat_sig.cc | 2 +- paddle/phi/ops/compat/diagonal_sig.cc | 2 +- paddle/phi/ops/compat/digamma_sig.cc | 2 +- paddle/phi/ops/compat/dot_sig.cc | 2 +- paddle/phi/ops/compat/elementwise_sig.cc | 34 +++++----- paddle/phi/ops/compat/empty_sig.cc | 2 +- paddle/phi/ops/compat/expand_sig.cc | 8 +-- paddle/phi/ops/compat/fill_any_like_sig.cc | 4 +- paddle/phi/ops/compat/fill_constant_sig.cc | 4 +- paddle/phi/ops/compat/flatten_sig.cc | 8 +-- paddle/phi/ops/compat/histogram_sig.cc | 2 +- paddle/phi/ops/compat/huber_loss_sig.cc | 4 +- paddle/phi/ops/compat/lerp_sig.cc | 4 +- paddle/phi/ops/compat/masked_select_sig.cc | 4 +- paddle/phi/ops/compat/matmul_sig.cc | 14 ++-- paddle/phi/ops/compat/norm_sig.cc | 4 +- paddle/phi/ops/compat/reduce_sig.cc | 8 +-- paddle/phi/ops/compat/reshape_sig.cc | 12 ++-- paddle/phi/ops/compat/scale_sig.cc | 2 +- paddle/phi/ops/compat/split_sig.cc | 2 +- paddle/phi/ops/compat/trace_sig.cc | 4 +- paddle/phi/ops/compat/trunc_sig.cc | 4 +- paddle/phi/tests/core/test_custom_kernel.cc | 24 +++---- paddle/phi/tests/core/test_kernel_factory.cc | 4 +- .../phi/tests/kernels/test_flatten_dev_api.cc | 6 +- .../tests/custom_kernel/custom_kernel_dot.cc | 2 +- python/paddle/utils/code_gen/api_gen.py | 2 +- .../utils/code_gen/wrapped_infermeta_gen.py | 4 +- tools/infrt/get_pten_kernel_function.sh | 6 +- 131 files changed, 354 insertions(+), 352 deletions(-) diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 6049f6e21e566..9a3552efce8e1 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -58,26 +58,26 @@ endfunction() function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL + # TODO(chenweihang): rename PD_REGISTER_KERNEL to PD_REGISTER_KERNEL # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") # parse the first kernel name - string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") + string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") # append kernel declare into declarations.h # TODO(chenweihang): default declare ALL_LAYOUT for each kernel if (${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, 
XPU, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") endif() endif() endforeach() @@ -285,9 +285,9 @@ endfunction() function(append_op_util_declare TARGET) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") - string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") + string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") string(APPEND util_declare ");\n") file(APPEND ${op_utils_header} "${util_declare}") endfunction() diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 592e787109d18..53dcc19fcbae8 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -118,7 +118,7 @@ REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOpMaker, InferShapeUtilsTestInferShapeFunctor); -PT_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, +PD_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, paddle::framework::InferShapeUtilsTestKernel, int) {} TEST(InferShapeUtilsTest, ALL) { diff --git a/paddle/phi/api/ext/op_kernel_info.h b/paddle/phi/api/ext/op_kernel_info.h index b52b0abe9e745..b3adbe9d18b96 100644 --- a/paddle/phi/api/ext/op_kernel_info.h +++ b/paddle/phi/api/ext/op_kernel_info.h @@ -630,16 +630,16 @@ class PADDLE_API OpKernelInfoBuilder { }; /////////////////////// Custom kernel register API ///////////////////////// // For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) +// Call after PD_REGISTER_BUILTIN_KERNEL(...) void RegisterAllCustomKernel(); //////////////// Custom kernel register macro ///////////////////// // Refer to paddle/phi/core/kernel_registry.h, we can not use -// PT_REGISTER_KERNEL directly, common macros and functions are +// PD_REGISTER_KERNEL directly, common macros and functions are // not ready for custom kernel now. // Difference: custom_kernel stores all kernels' info into global // g_custom_kernel_info_map before loading and registering into -// pten kernel management. Only providing PD_REGISTER_KERNEL which +// pten kernel management. Only providing PD_REGISTER_BUILTIN_KERNEL which // supports 2 template arguments. #define PD_BACKEND(arg__) phi::Backend::arg__ @@ -666,11 +666,12 @@ void RegisterAllCustomKernel(); #define PD_ID __LINE__ #endif -#define PD_REGISTER_KERNEL(kernel_name, backend, layout, func, cpp_dtype, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PD_REGISTER_KERNEL must be called in global namespace."); \ - _PD_REGISTER_2TA_KERNEL( \ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, func, cpp_dtype, ...) 
\ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_BUILTIN_KERNEL must be called in global namespace."); \ + _PD_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, func, cpp_dtype, ##__VA_ARGS__) // WIN32 is not supported diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 650161a933a8c..26408290bd325 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -17,6 +17,6 @@ limitations under the License. */ // api symbols declare, remove in the future #include "paddle/phi/api/lib/api_registry.h" -PT_DECLARE_API(Math); -PT_DECLARE_API(Utils); -PT_DECLARE_API(SparseApi); +PD_DECLARE_API(Math); +PD_DECLARE_API(Utils); +PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 2812bede8e09b..3783620ea449b 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -36,10 +36,10 @@ namespace experimental { */ // use to declare symbol -#define PT_REGISTER_API(name) \ +#define PD_REGISTER_API(name) \ PADDLE_API int RegisterSymbolsFor##name() { return 0; } -#define PT_DECLARE_API(name) \ +#define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/manual_api.cc index e0da15eac39b7..7bd4711cc3f30 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/manual_api.cc @@ -27,15 +27,15 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif #ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); #endif namespace paddle { @@ -147,4 +147,4 @@ PADDLE_API std::vector split(const Tensor& x, } // namespace experimental } // namespace paddle -PT_REGISTER_API(Utils); +PD_REGISTER_API(Utils); diff --git a/paddle/phi/api/lib/op_kernel_info.cc b/paddle/phi/api/lib/op_kernel_info.cc index 78b4955f321da..c2aef8288dae1 100644 --- a/paddle/phi/api/lib/op_kernel_info.cc +++ b/paddle/phi/api/lib/op_kernel_info.cc @@ -86,7 +86,7 @@ OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) { /////////////////////// Op register API ///////////////////////// // For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) +// Call after PD_REGISTER_BUILTIN_KERNEL(...) void RegisterAllCustomKernel() { auto& op_kernel_info_map = OpKernelInfoMap::Instance(); framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map); diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index 5a22d617492d2..cc90c2b819dae 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -22,20 +22,20 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); #endif namespace paddle { @@ -228,4 +228,4 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { } // namespace experimental } // namespace paddle -PT_REGISTER_API(SparseApi); +PD_REGISTER_API(SparseApi); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 62692fb9475da..9a2ec093119fd 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -71,17 +71,17 @@ enum class Backend : uint8_t { * Of course, we have also considered solving this problem through different * named macros, for example, if we define * - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND * * Based on this design pattern, the dtype and layout also have the same * requirements, this cause we need to define a series of macros * - * PT_REGISTER_KERNEL_FOR_ALL_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE * * It makes the system of registering macros more complicated, we think * this is not a simple design, so we still adopt the design of providing diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 5c0c440d8942c..ec810d4e16340 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -164,34 +164,34 @@ struct ArgumentMappingFnRegistrar { } }; -#define PT_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ +#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_base_kernel_name_ns_check_##op_type, \ - "PT_REGISTER_BASE_KERNEL_NAME must be 
called in global namespace."); \ + PD_REGISTER_base_kernel_name_ns_check_##op_type, \ + "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ static const ::phi::BaseKernelNameRegistrar \ __registrar_base_kernel_name_for_##op_type(#op_type, #base_kernel_name); \ int TouchBaseKernelNameSymbol_##op_type() { return 0; } -#define PT_DECLARE_BASE_KERNEL_NAME(op_type) \ +#define PD_DECLARE_BASE_KERNEL_NAME(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_ai_name_ns_check_##op_type, \ - "PT_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_DECLARE_ai_name_ns_check_##op_type, \ + "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ extern int TouchBaseKernelNameSymbol_##op_type(); \ UNUSED static int __declare_base_kernel_name_symbol_for_##op_type = \ TouchBaseKernelNameSymbol_##op_type() -#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ +#define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_arg_map_fn_ns_check_##op_type, \ - "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ + PD_REGISTER_arg_map_fn_ns_check_##op_type, \ + "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ static const ::phi::ArgumentMappingFnRegistrar \ __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn); \ int TouchArgumentMappingFnSymbol_##op_type() { return 0; } -#define PT_DECLARE_ARG_MAPPING_FN(op_type) \ +#define PD_DECLARE_ARG_MAPPING_FN(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_arg_map_fn_ns_check_##op_type, \ - "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ + PD_DECLARE_arg_map_fn_ns_check_##op_type, \ + "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ extern int TouchArgumentMappingFnSymbol_##op_type(); \ UNUSED static int __declare_arg_map_fn_symbol_for_##op_type = \ TouchArgumentMappingFnSymbol_##op_type() diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 2b98ab22bcdbd..1b8cfea130d49 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -282,10 +282,10 @@ struct InferMetaFnRegistrar { } }; -#define PT_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ +#define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_infer_meta_fn_ns_check_##kernel_name_prefix, \ - "PT_REGISTER_INFER_META_FN must be called in global namespace."); \ + PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix, \ + "PD_REGISTER_INFER_META_FN must be called in global namespace."); \ static const ::phi::InferMetaFnRegistrar \ __registrar_arg_map_fn_for_##kernel_name_prefix( \ #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn)) diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index a93c9a2826068..4603f4123acd0 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -234,7 +234,7 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/** PT_REGISTER_KERNEL +/** PD_REGISTER_KERNEL * * The most frequently used kernel registration macro, used for kernel * registration with only data type as template parameter, and the function @@ -243,8 +243,8 @@ struct KernelRegistrar { * * Note: `2TA` means `2 template argument` */ -#define PT_REGISTER_KERNEL(kernel_name, backend, layout, 
meta_kernel_fn, ...) \ - _PT_REGISTER_KERNEL(::phi::RegType::BUILTIN, \ +#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::BUILTIN, \ kernel_name, \ backend, \ ::phi::backend##Context, \ @@ -252,12 +252,12 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__) -#define _PT_REGISTER_KERNEL( \ +#define _PD_REGISTER_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PT_REGISTER_2TA_KERNEL(reg_type, \ + PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ kernel_name, \ backend, \ context, \ @@ -266,7 +266,7 @@ struct KernelRegistrar { __VA_ARGS__)) #ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ +#define _PD_REGISTER_2TA_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ @@ -295,7 +295,7 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_2TA_KERNEL( \ +#define _PD_REGISTER_2TA_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ @@ -909,27 +909,27 @@ struct KernelRegistrar { args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -/** PT_REGISTER_GENERAL_KERNEL +/** PD_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. 
*/ -#define PT_REGISTER_GENERAL_KERNEL( \ +#define PD_REGISTER_GENERAL_KERNEL( \ kernel_name, backend, layout, kernel_fn, dtype) \ - _PT_REGISTER_GENERAL_KERNEL( \ + _PD_REGISTER_GENERAL_KERNEL( \ ::phi::RegType::BUILTIN, kernel_name, backend, layout, kernel_fn, dtype) -#define _PT_REGISTER_GENERAL_KERNEL( \ +#define _PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - __PT_REGISTER_GENERAL_KERNEL( \ + PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ + __PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) #ifndef _WIN32 -#define __PT_REGISTER_GENERAL_KERNEL( \ +#define __PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ @@ -950,7 +950,7 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define __PT_REGISTER_GENERAL_KERNEL( \ +#define __PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ @@ -971,42 +971,43 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -/** PT_DECLARE_KERNEL +/** PD_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, * to avoid being removed by linker */ -#define PT_DECLARE_KERNEL(kernel_name, backend, layout) \ +#define PD_DECLARE_KERNEL(kernel_name, backend, layout) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_DECLARE_KERNEL must be called in global namespace."); \ + PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \ UNUSED static int \ __declare_kernel_symbol_for_##kernel_name##_##backend##_##layout = \ TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() -/** PD_REGISTER_KERNEL +/** PD_REGISTER_BUILTIN_KERNEL * * Used to register kernels for built-in backends. * Support CPU GPU XPU. */ -#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ - _PT_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ - kernel_name, \ - backend, \ - ::phi::backend##Context, \ - layout, \ - meta_kernel_fn, \ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ __VA_ARGS__) -/** PD_REGISTER_CUSTOM_KERNEL +/** PD_REGISTER_PLUGIN_KERNEL * * Used to register kernels for plug-in backends. * Support user-defined backend such as 'Ascend910'. */ -#define PD_REGISTER_CUSTOM_KERNEL( \ +#define PD_REGISTER_PLUGIN_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, ...) 
\ - _PT_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ kernel_name, \ backend, \ ::phi::CustomContext, \ diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4b13545e038f0..66a91e0ca53e8 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -539,5 +539,5 @@ void TraceInferMeta( } // namespace phi -PT_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); -PT_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); +PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/paddle/phi/kernels/cpu/abs_grad_kernel.cc b/paddle/phi/kernels/cpu/abs_grad_kernel.cc index 3c90a348d86a4..ca42a5eb2976f 100644 --- a/paddle/phi/kernels/cpu/abs_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_grad_kernel.cc @@ -19,7 +19,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, CPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -29,7 +29,7 @@ PT_REGISTER_KERNEL(abs_grad, int64_t, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, CPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 97bd89832870c..71d818c45e6f3 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -36,7 +36,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, CPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 4ba965a4e5f1d..09c07d9ec9dea 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -51,5 +51,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, CPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 4e95a37270dd4..c2c207bfaf25e 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -58,7 +58,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, CPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 3a886c3378524..ae09f2a5effe1 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, phi::ConjKernel, diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 7f4cce379e04d..0cae2599f8d13 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -110,7 +110,7 @@ void ConcatKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, CPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 8a79a5f6b1941..7dcd75d39e4df 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -56,5 +56,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, CPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 351b2335386a8..c3c290b4fe91e 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -82,7 +82,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, CPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 79f09008f3e2e..df17b458e1166 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -79,7 +79,7 @@ void DiagonalKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, CPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index 5cb86eef498bd..da1b5ae556609 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, CPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index 0013d8ee7740b..ee120a29b6061 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma, CPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index 729bc9aa3a3ac..a2abdb7c00900 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -20,7 +20,7 @@ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, CPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/cpu/dot_kernel.cc b/paddle/phi/kernels/cpu/dot_kernel.cc index f4f5d1ffeb544..3518501a6b63d 100644 --- a/paddle/phi/kernels/cpu/dot_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_kernel.cc @@ -49,7 +49,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, 
CPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 2d1b2a3bd7c3f..0b29091367c83 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -125,7 +125,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, CPU, ALL_LAYOUT, phi::AddGradKernel, @@ -137,7 +137,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, CPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, CPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, CPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -173,7 +173,7 @@ PT_REGISTER_KERNEL(subtract_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, CPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 427b6441b2d24..4799a6aa7afdf 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, CPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index cce367c8eb832..077048976729f 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, CPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index b55eb109f7de3..84d7f56d3361c 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -73,7 +73,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, CPU, ALL_LAYOUT, phi::FullKernel, @@ -89,7 +89,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, CPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index fbcf47c3070e6..82b88f868d8a7 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -77,7 +77,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, CPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc index bd2349393e742..654f2c9400af0 100644 --- a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ 
-17,6 +17,6 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( huber_loss_grad, CPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { } diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc index dfdab16bc85e3..702c0589057af 100644 --- a/paddle/phi/kernels/cpu/huber_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( huber_loss, CPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index 7cfb42dbcf96f..d74919011ec5d 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, CPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 97083c96464c3..7adfc35bfa321 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -17,4 +17,4 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" -PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 071bbba1975e4..7fe41e686af8c 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -43,7 +43,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, CPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 08fc3f69f01e1..274863a863b79 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -61,7 +61,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, CPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 862ee42296c92..581c5f90f35e5 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -118,7 +118,7 @@ using complex128 = ::phi::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::phi::dtype::bfloat16; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, phi::AddRawKernel, @@ -129,7 +129,7 @@ PT_REGISTER_KERNEL(add_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, CPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -140,7 +140,7 @@ PT_REGISTER_KERNEL(subtract_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide_raw, +PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -150,7 +150,7 @@ 
PT_REGISTER_KERNEL(divide_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, CPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(multiply_raw, bool, complex64, complex128) {} -PT_REGISTER_KERNEL(sum_raw, +PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, phi::SumRawKernel, @@ -176,5 +176,5 @@ PT_REGISTER_KERNEL(sum_raw, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index 56a185e4ade06..c68e8115e898b 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, CPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -28,7 +28,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, CPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -37,7 +37,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, CPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8676aec3eccb4..2bf56c07a5bc7 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, CPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index d2073c07244bd..597207a05a226 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -83,5 +83,5 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( norm_grad, CPU, ALL_LAYOUT, phi::NormGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index e8f35b5fe7efd..50906d9c3bb94 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -76,4 +76,4 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} +PD_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index 156afb8798de4..e929b5bd7219b 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -51,7 +51,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 6be931904d133..5fe11ffbd6d5c 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -21,4 +21,4 @@ limitations under the License. 
*/ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/bfloat16.h" -PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} +PD_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index d02909f007da4..259bf9e388c2c 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -60,7 +60,7 @@ void SplitKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, CPU, ALL_LAYOUT, phi::SplitKernel, diff --git a/paddle/phi/kernels/cpu/trace_grad_kernel.cc b/paddle/phi/kernels/cpu/trace_grad_kernel.cc index e6ffd99bc53bd..2167851b197d1 100644 --- a/paddle/phi/kernels/cpu/trace_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, CPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/cpu/trace_kernel.cc b/paddle/phi/kernels/cpu/trace_kernel.cc index 2b2cda6491d48..3646e22651913 100644 --- a/paddle/phi/kernels/cpu/trace_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_kernel.cc @@ -45,7 +45,7 @@ void TraceKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, CPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 7fc677c16ef73..4d85dd609e2d1 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -30,7 +30,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, CPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index 10e42196679fa..babae6ce7c931 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -35,5 +35,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, CPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 6d9e733b2f576..8109d3879cb21 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -38,7 +38,7 @@ void EmptyLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, CPU, ALL_LAYOUT, phi::EmptyKernel, @@ -54,7 +54,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, CPU, ALL_LAYOUT, phi::EmptyLikeKernel, @@ -71,7 +71,7 @@ PT_REGISTER_KERNEL(empty_like, phi::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, phi::EmptyKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, GPU, ALL_LAYOUT, phi::EmptyLikeKernel, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 33e6c2724982a..7e8010a43f3d1 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ 
b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -32,7 +32,7 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -44,7 +44,7 @@ PT_REGISTER_KERNEL(flatten_grad, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -59,7 +59,7 @@ PT_REGISTER_KERNEL(flatten_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, XPU, ALL_LAYOUT, phi::FlattenGradKernel, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 1ac444aa1792f..12eaab92d5211 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -48,7 +48,7 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, CPU, ALL_LAYOUT, phi::FlattenKernel, @@ -60,7 +60,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, CPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -73,7 +73,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, GPU, ALL_LAYOUT, phi::FlattenKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, GPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -101,7 +101,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, XPU, ALL_LAYOUT, phi::FlattenKernel, @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, XPU, ALL_LAYOUT, phi::FlattenWithXShape, diff --git a/paddle/phi/kernels/gpu/abs_grad_kernel.cu b/paddle/phi/kernels/gpu/abs_grad_kernel.cu index 37b19278233a8..1ce6a1638b1a0 100644 --- a/paddle/phi/kernels/gpu/abs_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_grad_kernel.cu @@ -20,7 +20,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, GPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -31,7 +31,7 @@ PT_REGISTER_KERNEL(abs_grad, phi::dtype::float16, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, GPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index 5c191dfc992a5..e122e6b1e9c8a 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -52,7 +52,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, GPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index b043a55e21b61..6127bceef509c 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -73,5 +73,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, GPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 
c05cd15b4757a..7a6c99c5fe15f 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -61,7 +61,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_KERNEL(cast, \ + PD_REGISTER_KERNEL(cast, \ GPU, \ ALL_LAYOUT, \ phi::CastKernel, \ diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 47a43ee9910b8..02fd408aba86f 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, GPU, ALL_LAYOUT, phi::ConjKernel, diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 22faeaf419700..c80a873127708 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -110,7 +110,7 @@ void ConcatKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, GPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 58b0a31d1d6d5..e88795b617370 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -207,5 +207,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, GPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 599fa2842a974..423093728e9d6 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -158,7 +158,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, GPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index c4b61cf819f84..58da29b2224a6 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -154,7 +154,7 @@ void DiagonalKernel(const Context& dev_ctx, } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, GPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/gpu/digamma_grad_kernel.cu b/paddle/phi/kernels/gpu/digamma_grad_kernel.cu index 54a618fe0421e..695227bba0f71 100644 --- a/paddle/phi/kernels/gpu/digamma_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/digamma_grad_kernel.cu @@ -18,5 +18,5 @@ #include "paddle/phi/kernels/digamma_grad_kernel.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, GPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/digamma_kernel.cu b/paddle/phi/kernels/gpu/digamma_kernel.cu index 91d63eeab8c83..381c22a82e863 100644 --- a/paddle/phi/kernels/gpu/digamma_kernel.cu +++ b/paddle/phi/kernels/gpu/digamma_kernel.cu @@ -19,5 +19,5 @@ #include "paddle/phi/kernels/digamma_kernel.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma, GPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/dot_grad_kernel.cu b/paddle/phi/kernels/gpu/dot_grad_kernel.cu index 
3290dba3d45b9..7defc0304e511 100644 --- a/paddle/phi/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_grad_kernel.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, GPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu index 9f3c3ff794aba..4442396f6c9dd 100644 --- a/paddle/phi/kernels/gpu/dot_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_kernel.cu @@ -52,7 +52,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, GPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index fc78fe88c2e0e..02dbb506c4eb5 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -119,7 +119,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, GPU, ALL_LAYOUT, phi::AddGradKernel, @@ -131,7 +131,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, GPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -143,7 +143,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, GPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -155,7 +155,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, GPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -167,7 +167,7 @@ PT_REGISTER_KERNEL(subtract_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, GPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu index 9ee58ad6caf29..8e2c3fde04a6a 100644 --- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/kernels/expand_grad_kernel.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, GPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/gpu/expand_kernel.cu b/paddle/phi/kernels/gpu/expand_kernel.cu index dc1b4717fcc4c..d4275804b3db8 100644 --- a/paddle/phi/kernels/gpu/expand_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_kernel.cu @@ -19,7 +19,7 @@ #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, GPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index caa05514c4f0f..d5cb1575b7181 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -98,7 +98,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, GPU, ALL_LAYOUT, phi::FullKernel, @@ -113,7 +113,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, 
+PD_REGISTER_KERNEL(full_like, GPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 47dee820e2fbd..6db987e22fc6c 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -149,7 +149,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, GPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu index 5e1e000a38d95..20cc2ed669adf 100644 --- a/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu @@ -17,6 +17,6 @@ #include "paddle/phi/kernels/huber_loss_grad_kernel.h" #include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( huber_loss_grad, GPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { } diff --git a/paddle/phi/kernels/gpu/huber_loss_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_kernel.cu index 2cca0c08a3f3b..26648a260b99e 100644 --- a/paddle/phi/kernels/gpu/huber_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/huber_loss_kernel.cu @@ -17,5 +17,5 @@ #include "paddle/phi/kernels/huber_loss_kernel.h" #include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( huber_loss, GPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu index 81bd69a5f12e0..0a5ac99fa8e45 100644 --- a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu @@ -17,5 +17,5 @@ #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" #include "paddle/phi/kernels/lerp_grad_kernel.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, GPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lerp_kernel.cu b/paddle/phi/kernels/gpu/lerp_kernel.cu index 190248c0cd077..96010aff4e70c 100644 --- a/paddle/phi/kernels/gpu/lerp_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_kernel.cu @@ -17,4 +17,4 @@ #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" #include "paddle/phi/kernels/lerp_kernel.h" -PT_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index c4f4b461f2aa0..71b7cd8750462 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -96,7 +96,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, GPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index 8254ce4be6356..fc4adca2f4243 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -108,7 +108,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, GPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index c3605ce655f2b..f7b1205cb593a 100644 
--- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -95,7 +95,7 @@ using float16 = phi::dtype::float16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, GPU, ALL_LAYOUT, phi::AddRawKernel, @@ -107,7 +107,7 @@ PT_REGISTER_KERNEL(add_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, GPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -119,7 +119,7 @@ PT_REGISTER_KERNEL(subtract_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(divide_raw, +PD_REGISTER_KERNEL(divide_raw, GPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -130,7 +130,7 @@ PT_REGISTER_KERNEL(divide_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, GPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -142,7 +142,7 @@ PT_REGISTER_KERNEL(multiply_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(sum_raw, +PD_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, phi::SumRawKernel, @@ -158,7 +158,7 @@ PT_REGISTER_KERNEL(sum_raw, kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(mean_raw, +PD_REGISTER_KERNEL(mean_raw, GPU, ALL_LAYOUT, phi::MeanRawKernel, diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 7da5fb2c98818..ff23ebd05b528 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, GPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -30,7 +30,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, GPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -40,7 +40,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, GPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index 3041784e93695..98be79c5f9dab 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, GPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu index 3530de11d35e2..ab38a82eceb1e 100644 --- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu @@ -111,7 +111,7 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm_grad, +PD_REGISTER_KERNEL(norm_grad, GPU, ALL_LAYOUT, phi::NormGradKernel, diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index 4ed3100918edf..274f91b8dd661 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -124,7 +124,7 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, +PD_REGISTER_KERNEL(norm, GPU, ALL_LAYOUT, phi::NormKernel, diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 245605ed8a91b..d9c8de21c5bc2 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -63,7 +63,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, GPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/gpu/sign_kernel.cu.cc b/paddle/phi/kernels/gpu/sign_kernel.cu.cc index 950cf67d7cff5..1fe17a7a227ec 100644 --- a/paddle/phi/kernels/gpu/sign_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/sign_kernel.cu.cc @@ -23,5 +23,5 @@ limitations under the License. */ using float16 = phi::dtype::float16; -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, phi::SignKernel, float, double, float16) {} diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index 919b0a7d4f966..5222fce03ace6 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -59,7 +59,7 @@ void SplitKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, GPU, ALL_LAYOUT, phi::SplitKernel, diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu index a7e4b55b4ca22..6692c1e19b033 100644 --- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, GPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index bc8b6bc922c91..7ac7c451b0054 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -44,7 +44,7 @@ void TraceKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, GPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index b5427d0b73867..92d95e7259bf0 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -44,7 +44,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, GPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu 
b/paddle/phi/kernels/gpu/trunc_kernel.cu index d9c0803de2832..cc44602b657aa 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -77,5 +77,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, GPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index e1e3679ea8be8..db6c5e1ac3591 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -81,10 +81,10 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} -PT_REGISTER_KERNEL(sum, +PD_REGISTER_KERNEL(sum, CPU, ALL_LAYOUT, phi::SumKernel, @@ -100,7 +100,7 @@ PT_REGISTER_KERNEL(sum, kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(add, +PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, phi::AddKernel, @@ -111,7 +111,7 @@ PT_REGISTER_KERNEL(add, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PD_REGISTER_KERNEL(subtract, CPU, ALL_LAYOUT, phi::SubtractKernel, @@ -122,7 +122,7 @@ PT_REGISTER_KERNEL(subtract, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PD_REGISTER_KERNEL(divide, CPU, ALL_LAYOUT, phi::DivideKernel, @@ -132,7 +132,7 @@ PT_REGISTER_KERNEL(divide, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, +PD_REGISTER_KERNEL(multiply, CPU, ALL_LAYOUT, phi::MultiplyKernel, @@ -145,7 +145,7 @@ PT_REGISTER_KERNEL(multiply, complex128) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(mean, +PD_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, phi::MeanKernel, @@ -155,7 +155,7 @@ PT_REGISTER_KERNEL(mean, int, int64_t, phi::dtype::float16) {} -PT_REGISTER_KERNEL(sum, +PD_REGISTER_KERNEL(sum, GPU, ALL_LAYOUT, phi::SumKernel, @@ -170,7 +170,7 @@ PT_REGISTER_KERNEL(sum, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(add, +PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, phi::AddKernel, @@ -182,7 +182,7 @@ PT_REGISTER_KERNEL(add, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PD_REGISTER_KERNEL(subtract, GPU, ALL_LAYOUT, phi::SubtractKernel, @@ -194,7 +194,7 @@ PT_REGISTER_KERNEL(subtract, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PD_REGISTER_KERNEL(divide, GPU, ALL_LAYOUT, phi::DivideKernel, @@ -205,7 +205,7 @@ PT_REGISTER_KERNEL(divide, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, +PD_REGISTER_KERNEL(multiply, GPU, ALL_LAYOUT, phi::MultiplyKernel, diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 436813b53e6cd..5361315bb611b 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -37,24 +37,24 @@ void ReshapeDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL(reshape_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_grad, CPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, CPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, ALL_DTYPE) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_GENERAL_KERNEL(reshape_grad, 
+PD_REGISTER_GENERAL_KERNEL(reshape_grad, GPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, GPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, @@ -62,12 +62,12 @@ PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_GENERAL_KERNEL(reshape_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_grad, XPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, XPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 68d9130850191..570e70ea11227 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -52,18 +52,18 @@ void ReshapeWithXShape(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, CPU, ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, CPU, ALL_LAYOUT, phi::ReshapeWithXShape, ALL_DTYPE) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, GPU, ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, GPU, ALL_LAYOUT, phi::ReshapeWithXShape, @@ -71,9 +71,9 @@ PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, XPU, ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, XPU, ALL_LAYOUT, phi::ReshapeWithXShape, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index baedf899d2b53..02231867fdd35 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -36,7 +36,7 @@ void FullSR(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full_sr, +PD_REGISTER_KERNEL(full_sr, CPU, ALL_LAYOUT, phi::FullSR, @@ -53,7 +53,7 @@ PT_REGISTER_KERNEL(full_sr, phi::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(full_sr, +PD_REGISTER_KERNEL(full_sr, GPU, ALL_LAYOUT, phi::FullSR, diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc index 67717ed469488..094b6f4d12022 100644 --- a/paddle/phi/kernels/selected_rows/scale_kernel.cc +++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc @@ -38,7 +38,7 @@ void ScaleSR(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale_sr, +PD_REGISTER_KERNEL(scale_sr, CPU, ALL_LAYOUT, phi::ScaleSR, @@ -52,7 +52,7 @@ PT_REGISTER_KERNEL(scale_sr, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(scale_sr, +PD_REGISTER_KERNEL(scale_sr, GPU, ALL_LAYOUT, phi::ScaleSR, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 408240b90a988..4374b5d7f1a1d 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -284,7 +284,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi 
-PT_REGISTER_KERNEL(dense_to_sparse_coo, +PD_REGISTER_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT, phi::sparse::DenseToSparseCooKernel, @@ -297,7 +297,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_coo, +PD_REGISTER_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT, phi::sparse::SparseCsrToCooKernel, @@ -310,7 +310,7 @@ PT_REGISTER_KERNEL(sparse_csr_to_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_csr, +PD_REGISTER_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT, phi::sparse::SparseCooToCsrKernel, @@ -323,7 +323,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_csr, int, int64_t) {} -PT_REGISTER_KERNEL(dense_to_sparse_csr, +PD_REGISTER_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT, phi::sparse::DenseToSparseCsrKernel, @@ -336,7 +336,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_csr, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_dense, +PD_REGISTER_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT, phi::sparse::SparseCooToDenseKernel, @@ -349,7 +349,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_dense, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_dense, +PD_REGISTER_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT, phi::sparse::SparseCsrToDenseKernel, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index ab2be13615e0e..b7793e4055445 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -553,7 +553,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PT_REGISTER_KERNEL(dense_to_sparse_coo, +PD_REGISTER_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT, phi::sparse::DenseToSparseCooKernel, @@ -566,7 +566,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_coo, +PD_REGISTER_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT, phi::sparse::SparseCsrToCooKernel, @@ -579,7 +579,7 @@ PT_REGISTER_KERNEL(sparse_csr_to_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_csr, +PD_REGISTER_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT, phi::sparse::SparseCooToCsrKernel, @@ -592,7 +592,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_csr, int, int64_t) {} -PT_REGISTER_KERNEL(dense_to_sparse_csr, +PD_REGISTER_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT, phi::sparse::DenseToSparseCsrKernel, @@ -605,7 +605,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_csr, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_dense, +PD_REGISTER_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT, phi::sparse::SparseCooToDenseKernel, @@ -618,7 +618,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_dense, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_dense, +PD_REGISTER_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT, phi::sparse::SparseCsrToDenseKernel, diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index eb7146487e38b..c981ca1158507 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -69,7 +69,7 @@ void TransferLayoutKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL(pten_transfer_layout, +PD_REGISTER_GENERAL_KERNEL(pten_transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel, diff --git a/paddle/phi/kernels/xpu/cast_kernel.cc b/paddle/phi/kernels/xpu/cast_kernel.cc index 0e50306a068c8..9aa503d58736d 100644 --- a/paddle/phi/kernels/xpu/cast_kernel.cc +++ b/paddle/phi/kernels/xpu/cast_kernel.cc @@ -86,7 +86,7 @@ void CastKernel(const Context& dev_ctx, } } // namespace phi 
-PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, XPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 559d110a9e8ad..3bbedbbb346e4 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -69,5 +69,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, XPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 98810fa9779a4..b514425cc54da 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -116,7 +116,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, XPU, ALL_LAYOUT, phi::FullKernel, @@ -132,7 +132,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, XPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/xpu/scale_kernel.cc b/paddle/phi/kernels/xpu/scale_kernel.cc index 0814e2d9b322f..e103e5afdcf9b 100644 --- a/paddle/phi/kernels/xpu/scale_kernel.cc +++ b/paddle/phi/kernels/xpu/scale_kernel.cc @@ -56,7 +56,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, XPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/ops/compat/abs_sig.cc b/paddle/phi/ops/compat/abs_sig.cc index 67319a18aafa1..b4b94457e6be9 100644 --- a/paddle/phi/ops/compat/abs_sig.cc +++ b/paddle/phi/ops/compat/abs_sig.cc @@ -32,7 +32,7 @@ KernelSignature AbsDoubleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(abs_double_grad, +PD_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_double_grad, phi::AbsDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cast_sig.cc b/paddle/phi/ops/compat/cast_sig.cc index 79cf59f32990e..3d970e92a7d68 100644 --- a/paddle/phi/ops/compat/cast_sig.cc +++ b/paddle/phi/ops/compat/cast_sig.cc @@ -22,4 +22,4 @@ KernelSignature CastOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(cast, phi::CastOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cast, phi::CastOpArgumentMapping); diff --git a/paddle/phi/ops/compat/concat_sig.cc b/paddle/phi/ops/compat/concat_sig.cc index de37b973409e9..21e653ccfe90f 100644 --- a/paddle/phi/ops/compat/concat_sig.cc +++ b/paddle/phi/ops/compat/concat_sig.cc @@ -25,4 +25,4 @@ KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diagonal_sig.cc b/paddle/phi/ops/compat/diagonal_sig.cc index 430edea89bea2..b4a424ec06bf2 100644 --- a/paddle/phi/ops/compat/diagonal_sig.cc +++ b/paddle/phi/ops/compat/diagonal_sig.cc @@ -25,4 +25,4 @@ KernelSignature DiagonalGradOpArgumentMapping( } } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/digamma_sig.cc 
b/paddle/phi/ops/compat/digamma_sig.cc index 555c16ef6b6bf..fa693f92c6fe3 100644 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ b/paddle/phi/ops/compat/digamma_sig.cc @@ -24,4 +24,4 @@ KernelSignature DigammaGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/dot_sig.cc b/paddle/phi/ops/compat/dot_sig.cc index 481bd3a4949d8..2437ecc1ca767 100644 --- a/paddle/phi/ops/compat/dot_sig.cc +++ b/paddle/phi/ops/compat/dot_sig.cc @@ -25,4 +25,4 @@ KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index dfffa034f1d1d..cddebcbce1273 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -102,28 +102,28 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); - -PT_REGISTER_ARG_MAPPING_FN(elementwise_add, +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_sub, +PD_REGISTER_ARG_MAPPING_FN(elementwise_sub, phi::ElementwiseSubOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_mul, +PD_REGISTER_ARG_MAPPING_FN(elementwise_mul, phi::ElementwiseMulOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_div, +PD_REGISTER_ARG_MAPPING_FN(elementwise_div, phi::ElementwiseDivOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, phi::ElementwiseAddGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, phi::ElementwiseAddDoubleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, phi::ElementwiseAddTripleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, phi::ElementwiseSubGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/empty_sig.cc b/paddle/phi/ops/compat/empty_sig.cc index 9315fdf827dcf..42cd55bdc0cda 100644 --- a/paddle/phi/ops/compat/empty_sig.cc +++ b/paddle/phi/ops/compat/empty_sig.cc @@ -28,4 
+28,4 @@ KernelSignature EmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); diff --git a/paddle/phi/ops/compat/expand_sig.cc b/paddle/phi/ops/compat/expand_sig.cc index 3f7ff458296c7..3b2e468267da0 100644 --- a/paddle/phi/ops/compat/expand_sig.cc +++ b/paddle/phi/ops/compat/expand_sig.cc @@ -47,8 +47,8 @@ KernelSignature ExpandGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(expand_v2, expand); -PT_REGISTER_BASE_KERNEL_NAME(expand_v2_grad, expand_grad); +PD_REGISTER_BASE_KERNEL_NAME(expand_v2, expand); +PD_REGISTER_BASE_KERNEL_NAME(expand_v2_grad, expand_grad); -PT_REGISTER_ARG_MAPPING_FN(expand_v2, phi::ExpandOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(expand_v2_grad, phi::ExpandGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expand_v2, phi::ExpandOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expand_v2_grad, phi::ExpandGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fill_any_like_sig.cc b/paddle/phi/ops/compat/fill_any_like_sig.cc index 3fbd022ca6a9a..84af155d402d6 100644 --- a/paddle/phi/ops/compat/fill_any_like_sig.cc +++ b/paddle/phi/ops/compat/fill_any_like_sig.cc @@ -23,6 +23,6 @@ KernelSignature FillAnyLikeOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); +PD_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); -PT_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fill_constant_sig.cc b/paddle/phi/ops/compat/fill_constant_sig.cc index 85dfdc3db3eae..df28a7b81b61b 100644 --- a/paddle/phi/ops/compat/fill_constant_sig.cc +++ b/paddle/phi/ops/compat/fill_constant_sig.cc @@ -123,6 +123,6 @@ KernelSignature FillConstantOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(fill_constant, full); +PD_REGISTER_BASE_KERNEL_NAME(fill_constant, full); -PT_REGISTER_ARG_MAPPING_FN(fill_constant, phi::FillConstantOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_constant, phi::FillConstantOpArgumentMapping); diff --git a/paddle/phi/ops/compat/flatten_sig.cc b/paddle/phi/ops/compat/flatten_sig.cc index ae5f438cafc24..b72ad05ea09d8 100644 --- a/paddle/phi/ops/compat/flatten_sig.cc +++ b/paddle/phi/ops/compat/flatten_sig.cc @@ -36,10 +36,10 @@ KernelSignature FlattenGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range, flatten); -PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range_grad, flatten_grad); +PD_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range, flatten); +PD_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range_grad, flatten_grad); -PT_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range, +PD_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range, phi::FlattenOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range_grad, +PD_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range_grad, phi::FlattenGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/histogram_sig.cc b/paddle/phi/ops/compat/histogram_sig.cc index 0fd1fdea76424..0cea146ea4e7f 100644 --- a/paddle/phi/ops/compat/histogram_sig.cc +++ b/paddle/phi/ops/compat/histogram_sig.cc @@ -22,4 +22,4 @@ KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(histogram, 
phi::HistogramOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); diff --git a/paddle/phi/ops/compat/huber_loss_sig.cc b/paddle/phi/ops/compat/huber_loss_sig.cc index 6e7183ff9f281..6f669a4a8b697 100644 --- a/paddle/phi/ops/compat/huber_loss_sig.cc +++ b/paddle/phi/ops/compat/huber_loss_sig.cc @@ -31,6 +31,6 @@ KernelSignature HuberLossGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(huber_loss_grad, +PD_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(huber_loss_grad, phi::HuberLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/lerp_sig.cc b/paddle/phi/ops/compat/lerp_sig.cc index d33a714048bd0..3a8b23ca4c4a4 100644 --- a/paddle/phi/ops/compat/lerp_sig.cc +++ b/paddle/phi/ops/compat/lerp_sig.cc @@ -29,5 +29,5 @@ KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/masked_select_sig.cc b/paddle/phi/ops/compat/masked_select_sig.cc index 77a97d103e889..8083b123bcff5 100644 --- a/paddle/phi/ops/compat/masked_select_sig.cc +++ b/paddle/phi/ops/compat/masked_select_sig.cc @@ -31,6 +31,6 @@ KernelSignature MaskedSelectGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(masked_select_grad, +PD_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(masked_select_grad, phi::MaskedSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/matmul_sig.cc b/paddle/phi/ops/compat/matmul_sig.cc index d4106cd39e304..771a7c3acc39d 100644 --- a/paddle/phi/ops/compat/matmul_sig.cc +++ b/paddle/phi/ops/compat/matmul_sig.cc @@ -49,13 +49,13 @@ KernelSignature MatmulTripleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, phi::MatmulDoubleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, phi::MatmulTripleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/norm_sig.cc b/paddle/phi/ops/compat/norm_sig.cc index f67c22ba712c8..81d294b842485 100644 --- a/paddle/phi/ops/compat/norm_sig.cc +++ b/paddle/phi/ops/compat/norm_sig.cc @@ -30,5 +30,5 @@ KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); 
-PT_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 2d16817ad886b..74704671f8b5d 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -45,8 +45,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); -PT_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); +PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); -PT_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reshape_sig.cc b/paddle/phi/ops/compat/reshape_sig.cc index 8e8b7592f909a..b6d10dabb1c7f 100644 --- a/paddle/phi/ops/compat/reshape_sig.cc +++ b/paddle/phi/ops/compat/reshape_sig.cc @@ -45,11 +45,11 @@ KernelSignature ReshapeDoubleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(reshape2, reshape); -PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad, reshape_grad); -PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad_grad, reshape_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(reshape2, reshape); +PD_REGISTER_BASE_KERNEL_NAME(reshape2_grad, reshape_grad); +PD_REGISTER_BASE_KERNEL_NAME(reshape2_grad_grad, reshape_double_grad); -PT_REGISTER_ARG_MAPPING_FN(reshape2, phi::ReshapeOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reshape2_grad, phi::ReshapeGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reshape2_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(reshape2, phi::ReshapeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reshape2_grad, phi::ReshapeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reshape2_grad_grad, phi::ReshapeDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index da8d028b2e39a..915ea4ce302ae 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -72,4 +72,4 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi // op_type, api_name, arg_mapping_fn -PT_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping); diff --git a/paddle/phi/ops/compat/split_sig.cc b/paddle/phi/ops/compat/split_sig.cc index 361a928e75394..b3a614aab0012 100644 --- a/paddle/phi/ops/compat/split_sig.cc +++ b/paddle/phi/ops/compat/split_sig.cc @@ -46,4 +46,4 @@ KernelSignature SplitOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(split, phi::SplitOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(split, phi::SplitOpArgumentMapping); diff --git a/paddle/phi/ops/compat/trace_sig.cc b/paddle/phi/ops/compat/trace_sig.cc index 774ac5a944f59..44fd53db98a3c 100644 --- a/paddle/phi/ops/compat/trace_sig.cc +++ b/paddle/phi/ops/compat/trace_sig.cc @@ -30,5 +30,5 @@ KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/trunc_sig.cc b/paddle/phi/ops/compat/trunc_sig.cc index 47fa5bc47b4b5..2d35439216da5 100644 --- a/paddle/phi/ops/compat/trunc_sig.cc +++ b/paddle/phi/ops/compat/trunc_sig.cc @@ -27,5 +27,5 @@ KernelSignature TruncGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index b0957d80aa95e..bc75e6ec45245 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -17,6 +17,8 @@ limitations under the License. */ #define _LINUX #endif +#include + #ifdef _LINUX #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -30,8 +32,6 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include - // user kernel function namespace custom_kernel { @@ -98,16 +98,16 @@ void FakeDot(const Context& dev_ctx, } } // namespace custom_kernel -PD_REGISTER_KERNEL(fake_dot, - CPU, - ALL_LAYOUT, - custom_kernel::FakeDot, - float, - double, - int, - int64_t, - int8_t, - uint8_t) {} +PD_REGISTER_BUILTIN_KERNEL(fake_dot, + CPU, + ALL_LAYOUT, + custom_kernel::FakeDot, + float, + double, + int, + int64_t, + int8_t, + uint8_t) {} namespace phi { namespace tests { diff --git a/paddle/phi/tests/core/test_kernel_factory.cc b/paddle/phi/tests/core/test_kernel_factory.cc index c85485cb91513..cb4b50f5b6c3d 100644 --- a/paddle/phi/tests/core/test_kernel_factory.cc +++ b/paddle/phi/tests/core/test_kernel_factory.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "gtest/gtest.h" -PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); namespace phi { namespace tests { @@ -76,7 +76,7 @@ TEST(KernelRegistry, SetFP32Input) { } // namespace tests } // namespace phi -PT_REGISTER_KERNEL(test, +PD_REGISTER_KERNEL(test, CPU, ALL_LAYOUT, phi::tests::TestKernel, diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index b65720a4b4e24..dc283728ee5f7 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -23,14 +23,14 @@ limitations under the License. 
*/ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif #ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); #endif namespace phi { diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc index 68393cba57e36..b0519138ca540 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc @@ -45,7 +45,7 @@ void DotKernel(const Context& dev_ctx, } // namespace custom_kernel } // namespace paddle -PD_REGISTER_KERNEL( +PD_REGISTER_BUILTIN_KERNEL( dot, CPU, ALL_LAYOUT, paddle::custom_kernel::DotKernel, int8_t) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8); } diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 106f698fd4b1e..f1e69a21f28d8 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -118,7 +118,7 @@ def source_include(header_file_path): def api_register(): return """ -PT_REGISTER_API(Math); +PD_REGISTER_API(Math); """ diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py index 53270c0546eae..0d018f8e3f64f 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py @@ -26,7 +26,7 @@ def get_wrapped_infermeta_name(api_name): def gene_wrapped_infermeta_and_register(api): if api.is_base_api and not api.is_dygraph_api: register_code = f""" -PT_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{api.infer_meta['func']});""" +PD_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{api.infer_meta['func']});""" if api.infer_meta['param'] is not None: kernel_params = api.kernel['param'] @@ -73,7 +73,7 @@ def gene_wrapped_infermeta_and_register(api): """ register_code = f""" -PT_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{get_wrapped_infermeta_name(api.kernel['func'][0])});""" +PD_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{get_wrapped_infermeta_name(api.kernel['func'][0])});""" return declare_code, defind_code, register_code else: diff --git a/tools/infrt/get_pten_kernel_function.sh b/tools/infrt/get_pten_kernel_function.sh index 75009b077b823..488c5b4c4328d 100644 --- a/tools/infrt/get_pten_kernel_function.sh +++ b/tools/infrt/get_pten_kernel_function.sh @@ -24,9 +24,9 @@ set -e kernel_register_info_file=`mktemp` PADDLE_ROOT="$( cd "$( dirname "$0" )/../../" && pwd )" unset GREP_OPTIONS && find ${PADDLE_ROOT}/paddle/phi/kernels -name "*.c*" \ - | xargs sed -e '/PT_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' \ + | xargs sed -e '/PD_REGISTER_\(GENERAL_\)\?KERNEL(/,/)/!d' \ | awk 'BEGIN { RS="{" }{ gsub(/\n /,""); print $0 }' \ - | grep PT_REGISTER \ + | grep PD_REGISTER \ | awk -F ",|\(|\)" '{gsub(/ /,"");$1="";print}' \ | sort -u | awk '{gsub(/phi::/,"");gsub(/paddle::platform::/,"");gsub(/dtype::/,"");gsub(/paddle::/,"");print $0}' \ | grep -v "_grad" > $kernel_register_info_file @@ -38,7 +38,7 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ --wrapped_infermeta_header_path ${temp_path}/generate.h \ 
--wrapped_infermeta_source_path ${temp_path}/generate.cc -grep PT_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ +grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt #step 3: merge all infos From de760d2c51c6438c55b5cf0f460e8b48e9a0a63c Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 22 Feb 2022 09:24:19 +0800 Subject: [PATCH 010/101] [Dy2St]Fix gym library version update problem with unittest (#39785) --- python/unittest_py/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index fe8382faa0c34..9165764adcaf4 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -3,6 +3,7 @@ coverage pycrypto ; platform_system != "Windows" mock gym +pygame==2.1.0 hypothesis opencv-python<=4.2.0.32 visualdl From f4e748870e18bb9eeb7b1ad2cbdbde83ee596b36 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 22 Feb 2022 09:29:37 +0800 Subject: [PATCH 011/101] Add Sort API for Kernel Primitive API (#39734) * Add Sort API for Kernel Primitive API * update & -> ptr --- .../kernels/primitive/compute_primitives.h | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index a9146c8aa5895..2d9a7522515d0 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -132,6 +132,40 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { return shared_memory[threadIdx.x]; } +// Swap data +template +__device__ __forceinline__ void Swap(T* first_value, T* second_value) { + T t_value; + t_value = (*first_value); + (*first_value) = (*second_value); + (*second_value) = t_value; +} + +// swap with monotonic_type +template +__device__ __forceinline__ void Comparator(T* first_value, + T* second_value, + int monotonic_type) { + if (((*first_value) > (*second_value)) == monotonic_type) { + Swap(first_value, second_value); + } +} + +template +__device__ __forceinline__ void ComparatorWithIndex(T* first_value, + + T* second_value, + IndexType* first_index, + IndexType* second_index, + int monotonic_type) { + if ((*first_value > (*second_value)) == monotonic_type) { + // swap value + Swap(first_value, second_value); + // swap index + Swap(first_index, second_index); + } +} + } // namespace details /** @@ -481,5 +515,94 @@ __device__ __forceinline__ void Cumsum(OutT* out, static_cast(temp[tidx + shared_size + (tidx + shared_size) / 32]); } +#define SHARED_SIZE_LIMIT \ + 1024 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must + // larger than blockDim.x * 2 +// if monotonic_type = 1 then increase +// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2 +// == 1 the increase +template +__device__ __forceinline__ void Sort(T* dst, + const T* src_data, + int num, + int monotonic_type) { + // todo: set num = Pow2(num) + // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 + __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than + // blockDim * 2 + // Copy value and index from src and src_index + value[threadIdx.x] = src_data[0]; + value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // make bitonicSort + for (int size = 2; size < num; size <<= 1) { + int bitonic_type = (threadIdx.x & (size / 2)) 
!= 0; + for (int stride = size / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + details::Comparator(&value[pos], &value[pos + stride], bitonic_type); + } + } + // last sort + for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // last sort when monotonic_type = 1 then increase + details::Comparator(&value[pos], &value[pos + stride], monotonic_type); + } + __syncthreads(); + dst[0] = value[threadIdx.x]; + dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + +template +__device__ __forceinline__ void Sort(T* dst, + IndexType* dst_index, + const T* src_data, + IndexType* src_index, + int num, + int monotonic_type) { + // todo: set num = Pow2(num) + // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 + __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than + // blockDim * 2 + __shared__ IndexType index[SHARED_SIZE_LIMIT]; + // Copy value and index from src and src_index + value[threadIdx.x] = src_data[0]; + value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // index + index[threadIdx.x] = src_index[0]; + index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1]; + // make bitonicSort + for (int size = 2; size < num; size <<= 1) { + int bitonic_type = (threadIdx.x & (size / 2)) != 0; + for (int stride = size / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + bitonic_type); + } + } + + for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // last sort when monotonic_type = 1 then increase + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + monotonic_type); + } + + __syncthreads(); + dst[0] = value[threadIdx.x]; + dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + dst_index[0] = index[threadIdx.x]; + dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + } // namespace kps } // namespace phi From 9f94821b9af22a95daa65658cfc7d5901df33fe0 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 22 Feb 2022 09:39:26 +0800 Subject: [PATCH 012/101] Modified RandomKernel with Kernel Primitive API (#39666) * Modified RandomKernel with Kernel Primitive API * update pten.h to phi.h * update * update fullKernel --- paddle/fluid/operators/gaussian_random_op.cu | 29 ++-- paddle/fluid/operators/index_impl.cu.h | 97 ++++++++++++ .../operators/uniform_random_inplace_op.cu | 141 ++---------------- paddle/fluid/operators/uniform_random_op.cu | 122 +-------------- paddle/fluid/operators/uniform_random_op.h | 122 +++++++++++++++ .../kernels/primitive/datamover_primitives.h | 9 ++ .../unittests/test_gaussian_random_op.py | 2 +- 7 files changed, 256 insertions(+), 266 deletions(-) create mode 100644 paddle/fluid/operators/index_impl.cu.h diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index fa9fe9d860201..21d827c79200c 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/index_impl.cu.h" DECLARE_bool(use_curand); @@ -65,7 +66,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); auto shape = GetShape(context); tensor->Resize(shape); @@ -88,15 +88,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_cxt, tensor, func); } } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; @@ -116,23 +114,22 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto& dev_cxt = + context.template device_context(); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second)); + auto func = GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second); + IndexKernel>(dev_cxt, tensor, func); } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h new file mode 100644 index 0000000000000..bae0d3f569f5f --- /dev/null +++ b/paddle/fluid/operators/index_impl.cu.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace paddle { +namespace operators { + +namespace kps = phi::kps; +template +__global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, + Functor func) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], + BLOCK_NUM_X * VecSize); + } + int num = numel - data_offset; + if (numel > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { + int numel = out->numel(); + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + if (numel <= 0) return; + int vec_size = paddle::platform::GetVectorizedSize((out->data())); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + + int main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index a5231354eb47e..1c7b9a27f8688 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -12,130 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" +#include "paddle/phi/kernels/full_kernel.h" namespace paddle { namespace operators { - -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -__global__ void fill_value(int64_t size, T* data, float value) { - for (int idx = threadIdx.x; idx < size; idx += blockDim.x) { - data[idx] = static_cast(value); - } -} - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random as uniform_random_op.cu. 
template class GPUUniformRandomInplaceKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out_var = ctx.OutputVar("Out"); - auto* tensor = out_var->GetMutable(); - T* data = tensor->mutable_data(ctx.GetPlace()); - unsigned int seed = static_cast(ctx.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(ctx.Attr("min")); - T max = static_cast(ctx.Attr("max")); - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - T diag_val = static_cast(ctx.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + UniformRandom(context, tensor); } }; @@ -143,17 +30,15 @@ template class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif auto* dx = ctx.Output(framework::GradVarName("X")); - auto* data = dx->mutable_data(ctx.GetPlace()); - - auto size = dx->numel(); - int64_t kBlockDim = std::min(size, kMaxBlockDim); - fill_value<<<1, kBlockDim, 0>>>(size, data, static_cast(0)); + auto dims = vectorize(dx->dims()); + const auto& dev_cxt = + ctx.template device_context(); + float value = static_cast(0.0f); + phi::FullKernel( + static_cast::TYPE&>(dev_cxt), + dims, value, phi::DataType::UNDEFINED, dx); } }; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 086c57527b48f..fb38a6aded4cf 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,88 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/uniform_random_op.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. 
template class GPUUniformRandomKernel : public framework::OpKernel { public: @@ -128,50 +51,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(context.Attr("min")); - T max = static_cast(context.Attr("max")); - unsigned int diag_num = - static_cast(context.Attr("diag_num")); - unsigned int diag_step = - static_cast(context.Attr("diag_step")); - T diag_val = static_cast(context.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + UniformRandom(context, tensor); } }; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index be6c3c740e692..a864c48ad7574 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -18,6 +18,16 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#if defined(__NVCC__) || defined(__HIPCC__) +DECLARE_bool(use_curand); +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/index_impl.cu.h" +#include "paddle/phi/kernels/full_kernel.h" +#endif namespace paddle { namespace operators { @@ -102,5 +112,117 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, + int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ 
UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandom(const framework::ExecutionContext& context, + framework::Tensor* tensor) { + int64_t size = tensor->numel(); + auto& dev_cxt = + context.template device_context(); + T* data = tensor->mutable_data(dev_cxt.GetPlace()); + if (size <= 0) return; + unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + unsigned int diag_num = + static_cast(context.Attr("diag_num")); + unsigned int diag_step = + static_cast(context.Attr("diag_step")); + T diag_val = static_cast(context.Attr("diag_val")); + int device_id = context.GetPlace().GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset); + IndexKernel>(dev_cxt, tensor, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_cxt, tensor, func); + } +} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 120be251db2c8..a6c4c40a7505e 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -714,5 +714,14 @@ __device__ __forceinline__ void ReadDataBc( } } +template +__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { + int thread_offset = block_offset + threadIdx.x * NX; +#pragma unroll + for (int nx = 0; nx < NX; ++nx) { + dst[nx] = static_cast(thread_offset + nx); + } +} + } // namespace kps } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 70ab1cc523507..43bcc3438eef4 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -21,7 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest import paddle From 38f87238193f75d1f03d31fa45356d6592fb1a4d Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 22 Feb 2022 09:56:52 +0800 Subject: [PATCH 013/101] fix usage of 
paddle.version.cuda() (#39780) --- python/paddle/fluid/dygraph/amp/auto_cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 01d64550321d5..37134764e9d1c 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -149,7 +149,7 @@ def _is_gpu_bfloat16_supported(): """ prop = paddle.device.cuda.get_device_capability() cuda_version = paddle.version.cuda() - if cuda_version is not None: + if cuda_version is not None and cuda_version != 'False': cuda_version_check = int(cuda_version.split('.')[0]) >= 11 else: cuda_version_check = False From ec21bf9873df5b074e9a55e5a26316f2f9d504c4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 22 Feb 2022 10:00:27 +0800 Subject: [PATCH 014/101] make enable_program_desc_tracing_ thread_local (#39776) --- paddle/fluid/imperative/tracer.cc | 2 ++ paddle/fluid/imperative/tracer.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 1c9cc538ffece..03811ac778779 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::enable_program_desc_tracing_ = false; + thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b508126c36796..73ecbbe6143ca 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -160,10 +160,11 @@ class Tracer { private: std::unique_ptr basic_engine_; std::unique_ptr program_desc_tracer_; - bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + + static thread_local bool enable_program_desc_tracing_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; From df1dbff160718de418cef4f02ea81070a7aed23a Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Tue, 22 Feb 2022 10:24:45 +0800 Subject: [PATCH 015/101] update precision catalog (#39717) --- tools/get_pr_ut.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 0ba6026535307..6b90a656f0107 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -139,13 +139,10 @@ def get_is_white_file(self, filename): """ judge is white file in pr's files. 
""" isWhiteFile = False not_white_files = (PADDLE_ROOT + 'cmake/', PADDLE_ROOT + 'patches/', - PADDLE_ROOT + 'paddle/testing/', PADDLE_ROOT + 'tools/dockerfile/', PADDLE_ROOT + 'tools/windows/', PADDLE_ROOT + 'tools/test_runner.py', - PADDLE_ROOT + 'tools/parallel_UT_rule.py', - PADDLE_ROOT + 'paddle/scripts/paddle_build.sh', - PADDLE_ROOT + 'paddle/scripts/paddle_build.bat') + PADDLE_ROOT + 'tools/parallel_UT_rule.py') if 'cmakelist' in filename.lower(): isWhiteFile = False elif filename.startswith((not_white_files)): @@ -285,9 +282,21 @@ def get_pr_ut(self): file_list = [] file_dict = self.get_pr_files() for filename in file_dict: - if filename.startswith( - (PADDLE_ROOT + 'python/', PADDLE_ROOT + 'paddle/fluid/')): + if filename.startswith(PADDLE_ROOT + 'python/'): file_list.append(filename) + elif filename.startswith(PADDLE_ROOT + 'paddle/'): + if filename.startswith((PADDLE_ROOT + 'paddle/infrt', + PADDLE_ROOT + 'paddle/utils')): + filterFiles.append(filename) + elif filename.startswith(PADDLE_ROOT + 'paddle/scripts'): + if filename.startswith( + (PADDLE_ROOT + 'paddle/scripts/paddle_build.sh', + PADDLE_ROOT + 'paddle/scripts/paddle_build.bat')): + file_list.append(filename) + else: + filterFiles.append(filename) + else: + file_list.append(filename) else: if file_dict[filename] == 'added': file_list.append(filename) From d945e24cd6d3ce7865c187c07fc5aafa4a866443 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 22 Feb 2022 10:29:25 +0800 Subject: [PATCH 016/101] sync recent changes (#39763) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../ir/ipu/optimizer_extract_pass.cc | 4 +- .../ir/ipu/popart_canonicalization_pass.cc | 8 +- .../fluid/platform/device/ipu/CMakeLists.txt | 6 +- .../fluid/platform/device/ipu/ipu_backend.cc | 24 +-- .../fluid/platform/device/ipu/ipu_backend.h | 1 - .../fluid/platform/device/ipu/ipu_compiler.cc | 190 +++++++++--------- .../fluid/platform/device/ipu/ipu_compiler.h | 60 +++--- .../fluid/platform/device/ipu/ipu_strategy.cc | 17 ++ .../fluid/platform/device/ipu/ipu_strategy.h | 6 + 10 files changed, 170 insertions(+), 148 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0e1e572a51f7f..dad5358590cb1 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -147,7 +147,7 @@ if(WITH_IPU) pass_library(ipu_runtime_replacer_pass base DIR ipu) pass_library(inference_process_pass base DIR ipu) pass_library(inference_postprocess_pass base DIR ipu) - pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu DEPS paddle_ipu) pass_library(ipu_inplace_pass base DIR ipu) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 3d8d353cbf530..9fe50deaf2d72 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -56,7 +56,7 @@ const bool is_regularization_op(const std::string& op_namescope) { } void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - // 这里构建的 op 符合 popart 的定义, 涉及到的一些值需要在 LowerOptimier 时获得 + // optimizer values will be extracted when lowering optimizer in ipu_backend OpDesc new_op("popart_optimizer", {}, {}, {}); new_op.SetAttr("op_role", 0); new_op.SetAttr("with_lr_sched", false); @@ -86,7 +86,7 @@ void 
IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { bool is_regularization = is_regularization_op(op_namescope); VLOG(10) << "found optimizer releated op: " << op_type; - // initial larning_rate will be set in LowerOptimier + // initial larning_rate will be set in ipu_backend set_ops.insert(op_type); if (op_type == "sgd") { auto type = std::string{"sgd"}; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 975a4b62cc708..6806e44f09505 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" @@ -28,11 +29,8 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; - auto nodes = graph->Nodes(); - for (auto* node : nodes) { - if (!node->IsOp()) { - continue; - } + auto sorted_ops = TopologySortOperations(*graph); + for (auto* node : sorted_ops) { auto* op = node->Op(); auto op_type = op->Type(); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index d54c6a33ecbf5..acf914c5087d0 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,9 +13,9 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) - cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) + add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 8f2a7ef78c982..e0b3b08a2313d 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -43,17 +43,17 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; - compiler_->Prepare(); - executor_->SetCompilerResources(compiler_->GetResources()); - - compiler_->InitInputs(graph, feed_list); - compiler_->LowerConstants(graph, scope_); - compiler_->LowerWeights(graph, scope_); - compiler_->LowerBody(graph); + compiler_->Prepare(graph); + compiler_->InitInputs(feed_list); + compiler_->LowerConstants(scope_); + compiler_->LowerWeights(scope_); + compiler_->LowerBody(); compiler_->InitOutputs(fetch_list); if (ipu_strategy_->is_training) { - compiler_->LowerOptimier(graph, scope_); + compiler_->LowerOptimizer(scope_); } + executor_->SetCompilerResources(compiler_->GetResources()); + is_compiled_ = true; // when call compile, means a new graph is_prepared_ = false; @@ -95,11 +95,9 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; 
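The reworked Compile() above now runs the lowering stages in a fixed order against state prepared once from the graph: Prepare(graph), then InitInputs, LowerConstants, LowerWeights, LowerBody, InitOutputs, and LowerOptimizer only when training. Later stages look up tensors that earlier stages registered, which is why the sequence matters. A minimal standalone sketch of that staged-lowering idea, assuming a made-up MiniCompiler type rather than the real ipu::Compiler:

// Sketch only: MiniCompiler is a hypothetical stand-in, not a Paddle class.
#include <iostream>
#include <string>
#include <vector>

class MiniCompiler {
 public:
  void Prepare() {
    // Built once from the graph, reused by every later stage
    // (mirrors the GraphHelper introduced in this patch).
    sorted_ops_ = {"feed", "matmul_v2", "reduce_sum", "fetch"};
  }
  void InitInputs() { std::cout << "init inputs\n"; }
  void LowerConstants() { std::cout << "lower constants\n"; }
  void LowerWeights() { std::cout << "lower weights\n"; }
  void LowerBody() {
    for (const auto& op : sorted_ops_) std::cout << "lower op: " << op << "\n";
  }
  void InitOutputs() { std::cout << "init outputs\n"; }
  void LowerOptimizer() { std::cout << "lower optimizer\n"; }

 private:
  std::vector<std::string> sorted_ops_;
};

int main() {
  MiniCompiler c;
  c.Prepare();          // shared graph state comes first
  c.InitInputs();
  c.LowerConstants();
  c.LowerWeights();
  c.LowerBody();
  c.InitOutputs();
  c.LowerOptimizer();   // only for training, as in Compile() above
  return 0;
}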
compiler_->SetIpuStrategy(strategy); executor_->SetIpuStrategy(strategy); -} - -void IpuBackend::SetCustomOps( - const std::vector& custom_ops) { - compiler_->SetCustomOps(custom_ops); + if (!strategy.custom_ops.empty()) { + compiler_->SetCustomOps(strategy.custom_ops); + } } void IpuBackend::SaveModelProto(const std::string& path) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index b12e2539258df..1244192490c16 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -71,7 +71,6 @@ class IpuBackend { const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } - void SetCustomOps(const std::vector &custom_ops); // save compiled model to onnx void SaveModelProto(const std::string &path); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index df2e456383e17..cdb3f6f9b3e28 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -98,6 +98,19 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +GraphHelper::GraphHelper(const Graph* g) { + graph = g; + sorted_ops = framework::ir::TopologySortOperations(*g); + for (auto* node : g->Nodes()) { + nodes_id_map[node->id()] = node; + if (node->IsVar()) { + vars_name_map[node->Name()] = node; + sorted_vars_id.push_back(node->id()); + } + } + std::sort(sorted_vars_id.begin(), sorted_vars_id.end()); +} + Compiler::Compiler() { RegisterOpFunc(); } Compiler::~Compiler() { @@ -105,9 +118,10 @@ Compiler::~Compiler() { resources_.reset(); } -void Compiler::Prepare() { +void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); + graph_helper_ = std::make_unique(graph); } void Compiler::RegisterOpFunc() { @@ -171,93 +185,24 @@ void Compiler::RegisterOpFunc() { #undef INT_VEC } -void Compiler::LowerBody(const Graph* graph) { - VLOG(10) << "enter Compiler::LowerBody"; - auto nodes = framework::ir::TopologySortOperations(*graph); - for (auto* node : nodes) { - auto* op_desc = node->Op(); - auto op_type = op_desc->Type(); - VLOG(10) << "lowering op: " << op_type; - - if (op_type == "popart_constant") { - // pass - } else if (op_type == "popart_optimizer") { - // pass - } else if (op_type == "popart_checkpointoutput") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto output_ids = builder_->checkpointOutput(inputs); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_custom_op") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto attributes = std::map{}; - for (auto& attr : op_desc->GetAttrMap()) { - CustomOpAttrVisitor visitor(&attributes, attr.first); - boost::apply_visitor(visitor, attr.second); - } - auto __op_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); - VLOG(10) << "Build graph from custom op: " << __op_type; - auto it = custom_ops_.find(__op_type); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_printtensor") { - auto inputs = GetOpInputs(op_desc); - auto outputs = 
GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto print_gradient = - BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); - auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); - auto output_ids = builder_->aiGraphcoreOpset1().printtensor( - inputs, print_gradient, debug_context, title); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else { - auto itr = name_function_.find(op_type); - if (itr != name_function_.end()) { - itr->second(node->Op()); - } else { - PADDLE_THROW(platform::errors::NotFound( - "%s is not registered, please check for unsupported operators for " - "running on IPU", - op_type)); - } - } - } - VLOG(10) << "leave Compiler::LowerBody"; -} - -void Compiler::InitInputs(Graph* graph, - const std::vector& feed_list) { +void Compiler::InitInputs(const std::vector& feed_list) { for (const auto& feed_name : feed_list) { - feed_list_.push_back(feed_name); - for (const Node* n : graph->Nodes()) { - if (n->IsVar()) { - auto* var_desc = n->Var(); - if (feed_name == var_desc->Name()) { - VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); - popart::TensorInfo input_info{data_type, var_desc->GetShape()}; - VLOG(10) << "popart input_info = " << input_info; - popart::TensorId tensor_id = - builder_->addInputTensor(input_info, feed_name); - VLOG(10) << "popart input tensor id = " << tensor_id; - resources_->inputs.push_back(tensor_id); - resources_->tensors.emplace(var_desc->Name(), tensor_id); - } - } - } + auto* node = graph_helper_->vars_name_map[feed_name]; + auto* var_desc = node->Var(); + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + popart::TensorInfo input_info{data_type, var_desc->GetShape()}; + VLOG(10) << "popart input_info = " << input_info; + popart::TensorId tensor_id = + builder_->addInputTensor(input_info, feed_name); + VLOG(10) << "popart input tensor id = " << tensor_id; + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { - fetch_list_.push_back(fetch_name); auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( tensor, resources_->tensors.end(), @@ -271,14 +216,10 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { } } -void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { +void Compiler::LowerConstants(const Scope* scope) { auto& kid_scope = scope->NewScope(); VLOG(10) << "enter Compiler::LowerConstants"; - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; - } - + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_constant") { @@ -308,17 +249,16 @@ void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { +void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - PADDLE_ENFORCE_NOT_NULL(scope, - platform::errors::PreconditionNotMet( - "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states - for (const auto* node : graph->Nodes()) { + for (auto id : graph_helper_->sorted_vars_id) { + auto* node = 
graph_helper_->nodes_id_map[id]; if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); if (resources_->tensors.count(var_name) != 0) { + VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } VLOG(10) << "lowering weight: " << var_name; @@ -344,12 +284,68 @@ void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerWeights"; } -void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; +void Compiler::LowerBody() { + VLOG(10) << "enter Compiler::LowerBody"; + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "lowering op: " << op_type; + + if (op_type == "popart_constant") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); + } + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: " << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "%s is not registered, please check for unsupported operators for " + "running on IPU", + op_type)); + } } + } + VLOG(10) << "leave Compiler::LowerBody"; +} +void Compiler::LowerOptimizer(const Scope* scope) { + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_optimizer") { diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5576266b1a771..5d1e8c2727d8f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -68,34 +68,29 @@ struct CompilerResources { std::unique_ptr optimizer; }; +// helper for lowering graph +struct GraphHelper { + explicit GraphHelper(const Graph *); + + const Graph *graph; + std::map vars_name_map; + std::map nodes_id_map; + std::vector 
sorted_ops; + std::vector sorted_vars_id; +}; + class Compiler { public: Compiler(); ~Compiler(); - void RegisterOpFunc(); - void Prepare(); - void LowerBody(const Graph *graph); - void InitInputs(Graph *graph, const std::vector &feed_list); + void Prepare(const Graph *graph); + void InitInputs(const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerConstants(const Graph *graph, const Scope *scope); - void LowerWeights(const Graph *graph, const Scope *scope); - void LowerOptimier(const Graph *graph, const Scope *scope); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); + void LowerConstants(const Scope *scope); + void LowerWeights(const Scope *scope); + void LowerBody(); + void LowerOptimizer(const Scope *scope); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; @@ -112,21 +107,34 @@ class Compiler { void SaveModelProtoNoCheck(const std::string &path); private: + void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); + private: std::unique_ptr builder_; std::unique_ptr resources_; + std::unique_ptr graph_helper_; using OpFunc = std::function; std::unordered_map name_function_; - // feed_list_ & fetch_list save paddle tensor id - std::vector feed_list_; - std::vector fetch_list_; - const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 4a9b9c00cb75c..943dfcc6cffb8 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -241,6 +241,15 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS + RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", + [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); + RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { popart_options.virtualGraphMode = popart::VirtualGraphMode::Manual; @@ -429,6 +438,14 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } 
+void IpuStrategy::AddCustomOp(const std::string& paddle_op, + const std::string& popart_op, + const std::string& domain, int version) { + LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; + custom_ops.push_back( + IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 0e2af26454c40..64436dc14fec3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -71,6 +72,9 @@ struct IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; + // custom ops + std::vector custom_ops; + private: std::map> bool_options; std::map> uint64_options; @@ -123,6 +127,8 @@ struct IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); From 8d1d0bdfad1568367bdad1be71916d344509a345 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Tue, 22 Feb 2022 10:39:50 +0800 Subject: [PATCH 017/101] add hard_swish in xpu2_op_list.h and update xpu.cmake,test=kunlun (#39586) --- cmake/external/xpu.cmake | 2 +- paddle/fluid/platform/device/xpu/xpu2_op_list.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 5e60f1f2b99fe..415c0fe9bef9e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index e27d56642efde..e6b08ed7bc340 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -196,6 +196,7 @@ XPUOpMap& get_kl2_ops() { {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, From 911cb2ea96452845aab650f04174b02851b62a65 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 22 Feb 2022 10:42:49 +0800 Subject: [PATCH 018/101] Support NoNeedBuffer for final state codegen (#39628) * Support NoNeedBuffer for final state codegen * Replaced pten with phi --- .../final_state_generator/eager_gen.py | 31 +++++++++++++++---- paddle/fluid/eager/tensor_wrapper.h | 20 ++++++++++-- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 786bf21e8c8a1..ca02a3d39779d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -127,6 +127,15 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def ParseNoNeedBuffer(string): + # string: "x, y" + no_need_buffer_set = set() + for name in string.split(","): + no_need_buffer_set.add(name.strip()) + + return no_need_buffer_set + + def ParseYamlArgs(string): # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y @@ -397,7 +406,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list, def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, - backward_attrs_list): + backward_attrs_list, no_need_buffer_set): # Inputs: # fwd_api_name = "" # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} @@ -410,15 +419,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): + if tname in no_need_buffer_set: + no_need_buffer = "true" + else: + no_need_buffer = "false" + tensor_wrapper_name = GetSavedName(tname) if IsPlainTensorType(ttype): SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved); + {} = egr::TensorWrapper({}, full_reserved, {}); }} """ set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tensor_wrapper_name, tname) + tname, tname, tensor_wrapper_name, tname, no_need_buffer) PLAIN_TENSOR_MEMBER_TEMPLATE = """ egr::TensorWrapper {}; @@ -430,12 +444,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) ); + {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); }}; }} """ set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tname, tensor_wrapper_name) + tname, tname, tname, tensor_wrapper_name, no_need_buffer) VECTOR_TENSOR_MEMBER_TEMPLATE = """ std::vector {}; @@ -997,6 +1011,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): assert 'output' in fwd_api.keys() assert 'backward' in fwd_api.keys() + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) + fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] @@ -1062,7 +1080,8 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): # Node Declaration Generation node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list) + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) print("Generated Node Declaration: ", node_declaration_str) node_definition_str += GenerateNodeDefinition( diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 6cc17b0a9c5fa..1732e0513d524 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,8 @@ 
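The generated SetTensorWrapper methods above forward a no_need_buffer flag so that wrappers whose gradient never reads the forward values keep only the tensor's metadata instead of holding the allocation alive. A rough sketch of that idea, assuming made-up DenseBuffer, TensorMeta, and MetaOnlyWrapper types rather than the real egr::TensorWrapper; the actual change in tensor_wrapper.h below copies only the DenseTensor meta for the same reason:

// Sketch only: these types are illustrative, not Paddle APIs.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct DenseBuffer {
  std::vector<float> data;
};

struct TensorMeta {
  std::vector<int64_t> dims;
};

class MetaOnlyWrapper {
 public:
  MetaOnlyWrapper(const TensorMeta& meta,
                  std::shared_ptr<DenseBuffer> buffer,
                  bool no_need_buffer)
      : meta_(meta) {
    // When the backward op only needs the shape, skip holding the buffer
    // so its memory can be released early.
    if (!no_need_buffer) {
      buffer_ = std::move(buffer);
    }
  }

  std::size_t held_bytes() const {
    return buffer_ ? buffer_->data.size() * sizeof(float) : 0;
  }
  std::size_t rank() const { return meta_.dims.size(); }

 private:
  TensorMeta meta_;
  std::shared_ptr<DenseBuffer> buffer_;
};

int main() {
  TensorMeta meta{{1024, 1024}};
  auto buf = std::make_shared<DenseBuffer>();
  buf->data.resize(1024 * 1024);

  MetaOnlyWrapper full(meta, buf, /*no_need_buffer=*/false);
  MetaOnlyWrapper meta_only(meta, buf, /*no_need_buffer=*/true);
  std::cout << "full holds " << full.held_bytes() << " bytes, rank "
            << full.rank() << "\n";
  std::cout << "meta-only holds " << meta_only.held_bytes() << " bytes, rank "
            << meta_only.rank() << "\n";
  return 0;
}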
class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false) { + bool full_reserved = false, + bool no_need_buffer = false) { /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -48,7 +49,22 @@ class TensorWrapper { } // shallow copy tensor_impl here - intermidiate_tensor_.set_impl(tensor.impl()); + if (no_need_buffer) { + if (phi::DenseTensor::classof(tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto tw_dense_tensor = std::make_shared(); + tw_dense_tensor->set_meta(dense_tensor->meta()); + intermidiate_tensor_.set_impl(tw_dense_tensor); + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unrecognized tensor type for no_need_buffer feature")); + } + } else { + intermidiate_tensor_.set_impl(tensor.impl()); + } + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); PADDLE_ENFORCE_NOT_NULL( EagerUtils::unsafe_autograd_meta(tensor), From 12c6d06afec626828a0e812f3681d87714826de3 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 22 Feb 2022 10:48:34 +0800 Subject: [PATCH 019/101] [pten]add check for using HostAlloc (#39771) * add check for using HostAlloc * add check for using HostAlloc --- tools/check_file_diff_approvals.sh | 43 +++++++++++++++++++----------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 7823646ff7bcb..55d2d59c7ece6 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -250,31 +250,42 @@ if [ "${EMPTY_GRAD_OP_REGISTERED}" != "" ] && [ "${GIT_PT_ID}" != "" ]; then check_approval 1 43953930 46782768 22165420 22361972 fi -HAS_MODIFIED_PTEN_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/" || true` -PTEN_INCLUDE_FLUID_FILES="" -for CHANGE_FILE in ${HAS_MODIFIED_PTEN_FILES}; do - PTEN_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep "#include \"paddle/fluid/" || true` - if [ "${PTEN_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - PTEN_INCLUDE_FLUID_FILES="${PTEN_INCLUDE_FLUID_FILES} ${CHANGE_FILE}" +HAS_MODIFIED_PHI_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/" || true` +PHI_INCLUDE_FLUID_FILES="" +for CHANGE_FILE in ${HAS_MODIFIED_PHI_FILES}; do + PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep "#include \"paddle/fluid/" || true` + if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + PHI_INCLUDE_FLUID_FILES="${PHI_INCLUDE_FLUID_FILES} ${CHANGE_FILE}" fi done -if [ "${PTEN_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must have one RD (chenwhql, MingMingShangTian, YuanRisheng or zyfncg) approval for the including paddle/fluid header in paddle/phi files(${PTEN_INCLUDE_FLUID_FILES}).\n" +if [ "${PHI_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (chenwhql, MingMingShangTian, YuanRisheng or zyfncg) approval for the including paddle/fluid header in paddle/phi files(${PHI_INCLUDE_FLUID_FILES}).\n" check_approval 1 chenwhql MingMingShangTian YuanRisheng zyfncg fi -HAS_MODIFIED_PTEN_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels" || true` -PTEN_USE_MUTABLE_DATA_FILES="" -for CHANGE_FILE 
in ${HAS_MODIFIED_PTEN_KERNEL_FILES}; do - PTEN_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` - if [ "${PTEN_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - PTEN_USE_MUTABLE_DATA_FILES="${PTEN_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" +HAS_MODIFIED_PHI_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels" || true` +PHI_USE_MUTABLE_DATA_FILES="" +for CHANGE_FILE in ${HAS_MODIFIED_PHI_KERNEL_FILES}; do + PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` + if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + PHI_USE_MUTABLE_DATA_FILES="${PHI_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" fi done -if [ "${PTEN_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You can not use the DenseTensor::mutable_data() method in paddle/phi/kernels files(${PTEN_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" +if [ "${PHI_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You can not use the DenseTensor::mutable_data() method in paddle/phi/kernels files(${PHI_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, MingMingShangTian, YuanRisheng or zyfncg) review and approve.\n" check_approval 1 chenwhql Shixiaowei02 MingMingShangTian YuanRisheng zyfncg fi +PHI_USE_HOSTALLOC_FILES="" +for CHANGE_FILE in ${HAS_MODIFIED_PHI_KERNEL_FILES}; do + PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "HostAlloc" || true` + if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + PHI_USE_HOSTALLOC_FILES="${PHI_USE_HOSTALLOC_FILES} ${CHANGE_FILE}" + fi +done +if [ "${PHI_USE_HOSTALLOC_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (phlrain, chenwhql) approval for the usage of phi::DeviceContext::HostAlloc() method in paddle/phi/kernels files(${PHI_USE_HOSTALLOC_FILES})\n" + check_approval 1 phlrain chenwhql +fi ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"` ALL_OPTEST_BAN_DYGRAPH_MESSAGE="" From b8dbffb772b4539c1fd3c8a613e5067cd238746d Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 22 Feb 2022 10:48:56 +0800 Subject: [PATCH 020/101] fix:Modify matrix latitude (#39686) --- .../ir/inference/test_trt_convert_matmul.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py index 8913159b2c4df..c6f2fa205c713 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py @@ -172,11 +172,11 @@ def generate_dynamic_shape(attrs): } self.dynamic_shape.max_input_shape = { "input1_data": [16, 4, 4], - "input2_data": [16, 4, 128] + "input2_data": 
[16, 4, 4] } self.dynamic_shape.opt_input_shape = { "input1_data": [8, 4, 4], - "input2_data": [8, 4, 16] + "input2_data": [8, 4, 4] } attrs = [ @@ -192,17 +192,7 @@ def generate_dynamic_shape(attrs): yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len( - self.dynamic_shape.min_input_shape - ) != 0 and self.trt_param.precision == paddle_infer.PrecisionType.Half: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "Tensorrt MatrixMultiply layer will get error when dynamic shape fp16 mode." - ) + pass def test(self): self.add_skip_trt_case() From cdf05dfc0af2e9800eda25c9ec98651c36c8f139 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 22 Feb 2022 10:49:13 +0800 Subject: [PATCH 021/101] delete skip_case for dropout_ut (#39629) --- .../ir/inference/test_trt_convert_dropout.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 57f5b5a0bb245..f9bb4e66f2ab4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -112,7 +112,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if attrs[0]['dropout_implementation'] == "upscale_in_train": return 0, 2 - elif self.dims == 1: + elif self.dims == 1 and dynamic_shape == False: return 0, 3 else: return 1, 2 @@ -141,17 +141,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len( - program_config.inputs['input_data'].shape - ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape has diff, but we can add shuffle layer to resolve it." 
- ) + pass def test(self): self.add_skip_trt_case() From 9b9d52e008cb19ecb14446c56b699b70ca722291 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:01:51 +0800 Subject: [PATCH 022/101] dont show warn msg default (#39730) --- .../fluid/dygraph/dygraph_to_static/logging_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index 4d9ed5916adfd..4a6d855a893f6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -143,9 +143,10 @@ def error(self, msg, *args, **kwargs): self._output_to_stdout('ERROR: ' + msg, *args) def warn(self, msg, *args, **kwargs): - self.logger.warning(msg, *args, **kwargs) - if self.need_to_echo_log_to_stdout: - self._output_to_stdout('WARNING: ' + msg, *args) + if self.verbosity_level != -1: + self.logger.warning(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('WARNING: ' + msg, *args) def log(self, level, msg, *args, **kwargs): if self.has_verbosity(level): From 7fa29a6be268380c39f8614390d73bfec5ae62cf Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 22 Feb 2022 11:02:05 +0800 Subject: [PATCH 023/101] [phi] add dtype fetcher for scalar (#39775) --- paddle/phi/common/scalar.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 1da77a0fa1964..9a5a3fbf921d0 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -191,6 +191,8 @@ class ScalarBase { } } + DataType dtype() const { return dtype_; } + private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); From 85a11c4781158fb17b0306f0f8085f0b58012909 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 22 Feb 2022 11:04:02 +0800 Subject: [PATCH 024/101] Modify the implementation of BlockXReduce to fit more scenes (#39554) * Modify the implementation of BlockYReduce to fit more scenes * fix * fix --- paddle/phi/kernels/primitive/compute_primitives.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 2d9a7522515d0..4f3c069f3b249 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -110,7 +110,11 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); val = reducer(val, temp); } - return val; + if (threadIdx.x == 0) { + shared[threadIdx.y] = val; + } + __syncthreads(); + return shared[threadIdx.y]; } /** From 0efa64c8bc69bad501fdc71e7862fc82851d5c1b Mon Sep 17 00:00:00 2001 From: zmxdream Date: Tue, 22 Feb 2022 11:11:50 +0800 Subject: [PATCH 025/101] [GPUPS]Config fleet optimize 2 (#39783) * update. test=develop * update. test=develop * fix. test=develop * update. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update. test=develop * update. 
test=develop --- paddle/fluid/framework/ps_gpu_trainer.cc | 160 ++++++++++++++++++++++ paddle/fluid/framework/trainer.h | 7 + paddle/fluid/framework/trainer_desc.proto | 3 + python/paddle/fluid/trainer_desc.py | 4 + 4 files changed, 174 insertions(+) diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 4d34ba85517e1..0705f658ff5fe 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -20,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } + // add for hbmps optimizer config + auto fleet_desc_str = trainer_desc.fleet_desc(); + google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); + auto sparse_table = + _ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.downpour_accessor_param(); + auto accessor_class = sparse_table_accessor.accessor_class(); + // gpups' sparse table optimizer config + // now only support single sparse table + // auto sparse_table = param_.sparse_table(0); + std::unordered_map config; + if (accessor_class == "DownpourFeatureValueAccessor" || + accessor_class == "DownpourCtrAccessor" || + accessor_class == "DownpourCtrDoubleAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["learning_rate"] = + sparse_table_accessor.sparse_sgd_param().learning_rate(); + config["initial_g2sum"] = + sparse_table_accessor.sparse_sgd_param().initial_g2sum(); + config["initial_range"] = + sparse_table_accessor.sparse_sgd_param().initial_range(); + if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) { + config["min_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[0]; + config["max_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } else if (accessor_class == "DownpourSparseValueAccessor") { + auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); + if (optimizer_name == "naive") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + 
config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_range(); + config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_g2sum(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["learning_rate"] = + sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate(); + config["initial_range"] = + sparse_table_accessor.sparse_commonsgd_param().adam().initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[1]; + } + } + } else if (accessor_class == "DownpourUnitAccessor" || + accessor_class == "DownpourDoubleUnitAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); + if (optimizer_name == "naive") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().naive().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .naive() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); + config["mf_initial_range"] = + 
sparse_table_accessor.embedx_sgd_param().adam().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .adam() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; + } + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } + + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(config); + scale_datanorm_ = trainer_desc.scale_datanorm(); int place_num = trainer_desc.worker_places_size(); const std::vector readers = diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index c993895a9f0ea..85eef89ee27f6 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -36,6 +36,10 @@ limitations under the License. */ #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif + namespace paddle { namespace framework { @@ -287,6 +291,9 @@ class PSGPUTrainer : public TrainerBase { int mpi_rank_; int mpi_size_; int dump_file_num_; + + // _ps_param for gpups optimizer config + ::paddle::PSParameter _ps_param; }; #endif diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 96d312437b34c..6fe33545aa22d 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -66,6 +66,9 @@ message TrainerDesc { repeated int32 trainers = 35; optional int32 trainer_id = 36; + // add for gpu + optional string fleet_desc = 37; + // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; optional DownpourWorkerParameter downpour_param = 103; diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 39320f5c0acf3..cdc9b14b6e328 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -111,6 +111,10 @@ def _set_infer(self, infer): def _set_fleet_desc(self, fleet_desc): self._fleet_desc = fleet_desc + ## serialize fleet_desc + from google.protobuf import text_format + fleet_desc_str = text_format.MessageToString(fleet_desc) + self.proto_desc.fleet_desc = fleet_desc_str def _gen_trainer_desc(self): pass From c8d6c146354e85864700b941fc288803f077b72b Mon Sep 17 00:00:00 2001 From: zhangxiaoci Date: Tue, 22 Feb 2022 11:14:52 +0800 Subject: [PATCH 026/101] refactor reshape2/shape unittest for kunlun (#39665) *test=kunlun --- .../unittests/xpu/test_reshape2_op_xpu.py | 339 ++++++++---------- .../tests/unittests/xpu/test_shape_op_xpu.py | 156 ++++---- 2 files changed, 245 insertions(+), 250 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py index 1a21b0f1972b7..0b000fc924ac1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py @@ -14,194 +14,167 @@ from __future__ import print_function -import unittest import numpy as np import sys - +import unittest sys.path.append("..") -from op_test import OpTest + import paddle -import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard - - -# situation 1: have shape( list, no tensor), no actual shape(Tensor) -class TestReshapeOp(OpTest): - def setUp(self): - self.init_data() - 
self.op_type = "reshape2" - self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} - self.attrs = {"shape": self.new_shape, "use_xpu": True} - self.outputs = { - "Out": self.inputs["X"].reshape(self.infered_shape), - 'XShape': np.random.random(self.ori_shape).astype("float32") - } - - def init_data(self): - self.ori_shape = (2, 60) - self.new_shape = (12, 10) - self.infered_shape = (12, 10) - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['XShape']) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out") - - -class TestReshapeOpDimInfer1(TestReshapeOp): - def init_data(self): - self.ori_shape = (5, 25) - self.new_shape = (5, -1, 5) - self.infered_shape = (5, -1, 5) - - -class TestReshapeOpDimInfer2(TestReshapeOp): - def init_data(self): - self.ori_shape = (10, 2, 6) - self.new_shape = (10, 0, 3, -1) - self.infered_shape = (10, 2, 3, -1) - - -# situation 2: have shape(list, no tensor), have actual shape(Tensor) -class TestReshapeOpWithInputShape(OpTest): - def setUp(self): - self.init_data() - self.op_type = "reshape2" - - self.inputs = { - "X": np.random.random(self.ori_shape).astype("float32"), - "Shape": np.array( - self.actual_shape, dtype="int32") - } - self.attrs = {"shape": self.new_shape, "use_xpu": True} - self.outputs = { - "Out": self.inputs["X"].reshape(self.actual_shape), - 'XShape': np.random.random(self.ori_shape).astype("float32") - } - - def init_data(self): - self.ori_shape = (6, 20) - self.new_shape = (0, -1, 20) - self.actual_shape = (2, 3, 20) - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['XShape']) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out") - - -# Situation 3: have shape(list, have tensor), no actual shape(Tensor) -class TestReshapeOp_attr_ShapeTensor(OpTest): - def setUp(self): - self.init_data() - self.op_type = "reshape2" - - shape_tensor = [] - for index, ele in enumerate(self.new_shape): - shape_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - "X": np.random.random(self.ori_shape).astype("float32"), - 'ShapeTensor': shape_tensor - } - self.attrs = {'shape': self.shape, "use_xpu": True} - self.outputs = { - "Out": self.inputs["X"].reshape(self.infered_shape), - 'XShape': np.random.random(self.ori_shape).astype("float32") - } - - def init_data(self): - self.ori_shape = (4, 25) - self.new_shape = (10, 10) - self.infered_shape = (10, 10) - self.shape = (-1, -1) - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['XShape']) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out") - - -class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): - def init_data(self): - self.ori_shape = (5, 20) - self.new_shape = (5, -1, 20) - self.infered_shape = (5, -1, 20) - self.shape = (5, -1, -1) - - -class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): - def init_data(self): - self.ori_shape = (10, 2, 6) - self.new_shape = (10, 0, 3, -1) - self.infered_shape = (10, 2, 3, -1) - self.shape = (10, 0, 3, 
-1) - - -# Situation 4: have shape(Tensor), no actual shape(Tensor) -class TestReshapeOp_attr_OnlyShape(OpTest): - def setUp(self): - self.init_data() - self.op_type = "reshape2" - - self.inputs = { - "X": np.random.random(self.ori_shape).astype("float32"), - "Shape": np.array( - self.new_shape, dtype="int32") - } - self.attrs = {"use_xpu": True} - self.outputs = { - "Out": self.inputs["X"].reshape(self.infered_shape), - 'XShape': np.random.random(self.ori_shape).astype("float32") - } - - def init_data(self): - self.ori_shape = (4, 25) - self.new_shape = (10, 10) - self.infered_shape = (10, 10) - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['XShape']) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out") - - -class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): - def init_data(self): - self.ori_shape = (5, 20) - self.new_shape = (5, -1, 10) - self.infered_shape = (5, -1, 10) - self.shape = (5, -1, -1) - - -class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): - def init_data(self): - self.ori_shape = (10, 2, 6) - self.new_shape = (10, 0, 3, -1) - self.infered_shape = (10, 2, 3, -1) - self.shape = (10, 0, 3, -1) +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReshapeOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "reshape2" + self.use_dynamic_create_class = False + + # situation 1: have shape( list, no tensor), no actual shape(Tensor) + class TestReshapeOp(XPUOpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + self.init_test_input() + self.init_test_output() + self.init_attrs() + + def init_data(self): + self.ori_shape = (2, 60) + self.new_shape = (12, 10) + self.infered_shape = (12, 10) + + def init_test_input(self): + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype) + } + + def init_test_output(self): + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype(self.dtype) + } + + def init_attrs(self): + self.attrs = {"shape": self.new_shape, "use_xpu": True} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + class TestReshapeOpDimInfer1(TestReshapeOp): + def init_data(self): + self.ori_shape = (5, 25) + self.new_shape = (5, -1, 5) + self.infered_shape = (5, -1, 5) + + class TestReshapeOpDimInfer2(TestReshapeOp): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + + # situation 2: have shape(list, no tensor), have actual shape(Tensor) + class TestReshapeOpWithInputShape(TestReshapeOp): + def init_data(self): + self.ori_shape = (6, 20) + self.new_shape = (0, -1, 20) + self.actual_shape = (2, 3, 20) + + def init_test_input(self): + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype), + "Shape": np.array( + self.actual_shape, dtype="int32") + } + + def init_test_output(self): + self.outputs = { + "Out": self.inputs["X"].reshape(self.actual_shape), + 
'XShape': np.random.random(self.ori_shape).astype(self.dtype) + } + + # Situation 3: have shape(list, have tensor), no actual shape(Tensor) + class TestReshapeOp_attr_ShapeTensor(TestReshapeOp): + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + self.infered_shape = (10, 10) + self.shape = (-1, -1) + + def init_test_input(self): + shape_tensor = [] + for index, ele in enumerate(self.new_shape): + shape_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype), + 'ShapeTensor': shape_tensor + } + + def init_attrs(self): + self.attrs = {'shape': self.shape, "use_xpu": True} + + class TestReshapeOpDimInfer1_attr_ShapeTensor( + TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 20) + self.infered_shape = (5, -1, 20) + self.shape = (5, -1, -1) + + class TestReshapeOpDimInfer2_attr_ShapeTensor( + TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + # Situation 4: have shape(Tensor), no actual shape(Tensor) + class TestReshapeOp_attr_OnlyShape(TestReshapeOp): + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + self.infered_shape = (10, 10) + + def init_test_input(self): + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype), + "Shape": np.array( + self.new_shape, dtype="int32") + } + + def init_attrs(self): + self.attrs = {"use_xpu": True} + + class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 10) + self.infered_shape = (5, -1, 10) + self.shape = (5, -1, -1) + + class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + +support_types = get_xpu_op_support_types("reshape2") +for stype in support_types: + create_test_class(globals(), XPUTestReshapeOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py index f194f3ca80cf0..c7fa72ca7700e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py @@ -18,77 +18,99 @@ import numpy as np import sys sys.path.append("..") -from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle from paddle.fluid import core from paddle.fluid.op import Operator - -class TestShapeOp(OpTest): - def setUp(self): - self.op_type = "shape" - self.config() - self.shape = [2, 3] - input = np.zeros(self.shape) - self.inputs = {'Input': input} - self.outputs = {'Out': np.array(self.shape)} - - def config(self): - self.shape = [2, 3] - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - -class case1(TestShapeOp): - def config(self): - self.shape = [2] - - -class case2(TestShapeOp): - def config(self): - self.shape = [1, 2, 3] - - -class TestShapeWithSelectedRows(unittest.TestCase): - def get_places(self): - places = [core.CPUPlace()] - 
if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - if core.is_compiled_with_xpu(): - places.append(core.XPUPlace(0)) - return places - - def check_with_place(self, place): - scope = core.Scope() - x_rows = [0, 1, 5, 4, 19] - height = 20 - row_numel = 2 - - np_array = np.ones((len(x_rows), row_numel)).astype("float32") - - # initialize input variable X - x = scope.var('X').get_selected_rows() - x.set_rows(x_rows) - x.set_height(height) - x_tensor = x.get_tensor() - x_tensor.set(np_array, place) - - # initialize input variable Out - out_shape = scope.var("Out").get_tensor() - op = Operator("shape", Input="X", Out="Out") - - op.run(scope, place) - - out_shape = np.array(out_shape).tolist() - self.assertListEqual([5, 2], out_shape) - - def test_check_output(self): - for place in self.get_places(): - self.check_with_place(place) - +paddle.enable_static() + + +class XPUTestShapeOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "shape" + self.use_dynamic_create_class = False + + class TestShapeOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.op_type = "shape" + self.config() + input = np.zeros(self.shape) + self.inputs = {'Input': input.astype(self.dtype)} + self.outputs = {'Out': np.array(self.shape)} + + def config(self): + self.shape = [2, 3] + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + class TestShapeOp1(TestShapeOp): + def config(self): + self.shape = [2] + + class TestShapeOp2(TestShapeOp): + def config(self): + self.shape = [1, 2, 3] + + class TestShapeOp3(TestShapeOp): + def config(self): + self.shape = [1, 2, 3, 4] + + class TestShapeOp4(TestShapeOp): + def config(self): + self.shape = [1, 2, 3, 4, 1024] + + class TestShapeOp5(TestShapeOp): + def config(self): + self.shape = [1, 2, 3, 4, 1, 201] + + class TestShapeWithSelectedRows(unittest.TestCase): + def setUp(self): + self.dtype = self.in_type + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + if core.is_compiled_with_xpu(): + places.append(core.XPUPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 1, 5, 4, 19] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype(self.dtype) + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + out_shape = scope.var("Out").get_tensor() + op = Operator("shape", Input="X", Out="Out") + + op.run(scope, place) + + out_shape = np.array(out_shape).tolist() + self.assertListEqual([5, 2], out_shape) + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +support_types = get_xpu_op_support_types("shape") +for stype in support_types: + create_test_class(globals(), XPUTestShapeOp, stype) if __name__ == '__main__': unittest.main() From 5595fdbbd20b80190b30ab2f77329f2c0c4cfdc4 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 22 Feb 2022 11:37:11 +0800 Subject: [PATCH 027/101] [Auto Parallel] Add the high-level Engine API (#39709) * [Auto Parallel] Add the high-level Engine API * Update the test cmakefile --- .../distributed/auto_parallel/dist_context.py | 20 +- .../distributed/auto_parallel/dist_loader.py | 93 ++++++ .../distributed/auto_parallel/engine.py | 309 ++++++++++++++++++ .../unittests/auto_parallel/CMakeLists.txt | 1 + 
.../auto_parallel/test_engine_api.py | 135 ++++++++ 5 files changed, 552 insertions(+), 6 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/dist_loader.py create mode 100644 python/paddle/distributed/auto_parallel/engine.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index e06811df88179..caf220646bb60 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -45,9 +45,13 @@ class DistributedContext: One auto-parallel run should use its own DistributedContext to avoid interfering other run. """ - def __init__(self, program=None): + def __init__(self, + serial_main_prog=None, + serial_startup_prog=None, + dist_main_progs=None, + dist_startup_progs=None): # Program related data members - self._serial_program = program + self._serial_program = serial_main_prog self._is_initialized_for_program = False self._dist_tensors_for_program = {} self._dist_ops_for_program = {} @@ -65,8 +69,12 @@ def __init__(self, program=None): self._tensor_id_to_tensor_node_ids = {} # Distributed programs - self._dist_main_programs = {} - self._dist_startup_programs = {} + self._dist_main_programs = dist_main_progs + if not self._dist_main_programs: + self._dist_main_programs = {} + self._dist_startup_programs = dist_startup_progs + if not self._dist_startup_programs: + self._dist_startup_programs = {} @property def serial_program(self): @@ -78,8 +86,8 @@ def serial_graph(self): @serial_program.setter def serial_program(self, program): - assert self._serial_program is None, \ - "This distributed context has already been realted to a serial program" + # assert self._serial_program is None, \ + # "This distributed context has already been realted to a serial program" self._serial_program = program @property diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py new file mode 100644 index 0000000000000..92deeffd2c901 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import abc +import numpy as np +import paddle +from paddle.io import DataLoader, DistributedBatchSampler + + +class DistributedDataLoader(metaclass=abc.ABCMeta): + def __init__(self, + dataset, + batch_size=1, + epochs=1, + data_parallel_world_size=None, + data_parallel_rank=None, + drop_last=False): + self.dataset = dataset + self.batch_size = batch_size + self.epochs = epochs + self.data_parallel_world_size = data_parallel_world_size + self.data_parallel_rank = data_parallel_rank + self.drop_lost = drop_last + if data_parallel_world_size is not None: + assert batch_size % data_parallel_world_size == 0 + + @abc.abstractmethod + def __iter__(self): + raise NotImplementedError + + @abc.abstractmethod + def __next__(self): + raise NotImplementedError + + +class NonIterableGeneratorLoader(DistributedDataLoader): + def __init__(self, + dataset, + feed_list, + places, + batch_size=1, + epochs=1, + steps_per_epoch=1000, + data_parallel_world_size=None, + data_parallel_rank=None, + drop_last=False): + self.feed_list = feed_list + self.places = places + self.steps_per_epoch = steps_per_epoch + super(NonIterableGeneratorLoader, self).__init__( + dataset, batch_size, epochs, data_parallel_world_size, + data_parallel_rank, drop_last) + self._inner_dataloader = self._create_inner_dataloader() + + def __iter__(self): + self._cur_step = 0 + self._inner_dataloader.start() + return self + + def __next__(self): + if self._cur_step < self.steps_per_epoch: + self._cur_step += 1 + else: + self._inner_dataloader.reset() + raise StopIteration + + def _create_inner_dataloader(self): + def data_generator(): + batch_data = None + for step, data in enumerate(self.dataset): + if batch_data is None: + batch_data = [[] for i in range(len(data))] + for idx, data_item in enumerate(data): + batch_data[idx].append(np.array(data_item)) + if (step + 1) % self.batch_size == 0: + yield batch_data[0], batch_data[1] + batch_data = None + + dataloader = paddle.fluid.io.DataLoader.from_generator( + feed_list=self.feed_list, capacity=70, iterable=False) + dataloader.set_batch_generator(data_generator, self.places) + return dataloader diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py new file mode 100644 index 0000000000000..98b76056a15a4 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import logging +from collections import defaultdict + +import paddle +from paddle import fluid +from paddle.io import Dataset +from paddle.fluid.backward import append_backward +import paddle.fluid.core as core +from paddle.static import InputSpec +from paddle.fluid import program_guard +from paddle.fluid.framework import Operator +from paddle.fluid.framework import _current_expected_place as _get_device +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.distributed.passes import new_pass, PassContext +from paddle.distributed.utils import get_logger + +from .dist_loader import NonIterableGeneratorLoader +from .dist_op import DistributedOperator +from .dist_tensor import DistributedTensor +from .dist_context import DistributedContext +from .dist_context import get_default_distributed_context +from .dist_context import set_default_distributed_context +from .process_group import get_all_process_groups +from .process_group import get_process_group +from .process_group import get_world_process_group +from .process_group import _g_process_group_map, ProcessGroup +from .completion import Completer +from .partitioner import Partitioner +from .reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER +from .cluster import Cluster +from .mapper import mapping +from .planner import Planner +from .utils import make_data_unshard +from .utils import set_grad_var_shape +from .utils import print_program_with_dist_attr +from .utils import SerialProgramInfo + +paddle.enable_static() + + +def to_list(value): + if value is None: + return value + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + +class Engine: + def __init__(self, model=None, data_spec=None, cluster=None, strategy=None): + self.model = model + self.data_spec = data_spec + self.cluster = cluster + self.strategy = strategy + self._executor = None + self._orig_main_prog = fluid.default_main_program() + self._orig_startup_prog = fluid.default_startup_program() + self._serial_main_progs = {} + self._serial_startup_progs = {} + self._dist_main_progs = defaultdict(dict) + self._dist_startup_progs = defaultdict(dict) + self._orig_dist_context = get_default_distributed_context() + self._dist_contexts = {} + self._pass_contexts = {} + self._cur_rank = paddle.distributed.get_rank() + self._logger = get_logger(logging.INFO) + + def prepare(self, + optimizer=None, + loss=None, + metrics=None, + mode="train", + all_ranks=False): + self.optimizer = optimizer + self.loss = loss + self.metrics = metrics + self.mode = mode + self._build() + self._plan() + if not all_ranks: + self._parallel(self._cur_rank) + else: + world_process_group = get_world_process_group() + all_ranks = world_process_group.ranks + for rank in all_ranks: + self._parallel(rank) + place = _get_device() + if isinstance(place, fluid.CUDAPlace): + self._place = fluid.CUDAPlace(ParallelEnv().dev_id) + if self._executor is None: + self._executor = fluid.Executor(place) + + def _build(self): + serial_main_prog = self._serial_main_progs.get(self.mode, None) + if serial_main_prog is not None: + return + + serial_main_prog = self._orig_main_prog.clone() + serial_startup_prog = self._orig_startup_prog.clone() + with fluid.program_guard(serial_main_prog, serial_startup_prog): + inputs_spec = self.data_spec[0] + labels_spec = self.data_spec[1] + inputs = [s._create_feed_layer() for s in to_list(inputs_spec)] + labels = [s._create_feed_layer() for s in to_list(labels_spec)] + self._input_vars = inputs + self._label_vars = labels + feed_list = 
self._input_vars + self._label_vars + outputs = to_list(self.model(*inputs)) + if self.mode != "predict" and self.loss: + loss = self.loss(*(outputs + labels)) + self._loss_var = loss + + self._serial_main_progs[self.mode] = serial_main_prog + self._serial_startup_progs[self.mode] = serial_startup_prog + self._dist_contexts[self.mode] = DistributedContext( + serial_main_prog, serial_startup_prog, + self._dist_main_progs[self.mode], + self._dist_startup_progs[self.mode]) + self._pass_contexts[self.mode] = PassContext() + + def _plan(self): + # Complete the distributed annotation + serial_main_prog = self._serial_main_progs[self.mode] + self._completer = Completer(self._dist_contexts[self.mode]) + self._completer.complete_forward_annotation(serial_main_prog) + # TODO: add auto planner process + + def _parallel(self, rank): + serial_main_program = self._serial_main_progs[self.mode] + serial_startup_program = self._serial_startup_progs[self.mode] + dist_context = self._dist_contexts[self.mode] + if self.mode != "predict" and self.loss: + # Generate backward + serial_loss = self._loss_var + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, serial_loss, + params_grads) + # Do logical partition + partitioner = Partitioner(dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, params_grads) + # Generate optimizer + self._generate_optimizer(dist_main_prog, dist_startup_prog, + dist_params_grads) + # Do reshard process + set_grad_var_shape(dist_main_prog, dist_context) + make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank, dist_context, + dist_params_grads) + # Apply post optimization passes + self._apply_post_optimization(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) + self._dist_main_progs[self.mode][rank] = dist_main_prog + self._dist_startup_progs[self.mode][rank] = dist_startup_prog + + def _generate_backward(self, main_program, startup_program, loss): + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, + distop_context=self._dist_contexts[self.mode].dist_op_context) + self._completer.complete_backward_annotation(main_program) + return params_grads + + def _generate_optimizer(self, main_program, startup_program, params_grads): + with program_guard(main_program, startup_program): + optimizer_ops = copy.deepcopy(self.optimizer).apply_gradients( + params_grads) + self._completer.complete_update_annotation(main_program) + return optimizer_ops + + def _apply_pre_optimization(self, main_program, startup_program, loss, + params_grads): + # apply amp pass + if self.strategy.amp: + config = copy.deepcopy(self.strategy.amp_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + config["loss"] = loss + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_contexts[self.mode]) + + # apply recompute pass + if self.strategy.recompute: + config = copy.deepcopy(self.strategy.recompute_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["no_grad_set"] = None + config["loss"] = loss + auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", + config) + 
auto_parallel_recompute_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) + + def _apply_post_optimization(self, main_program, startup_program, rank, + params_grads): + if self.strategy.sharding: + config = copy.deepcopy(self.strategy.sharding_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) + + if self.strategy.gradient_merge: + config = copy.deepcopy(self.strategy.gradient_merge_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], + self._pass_contexts[self.mode]) + + def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=1000): + assert isinstance(train_data, Dataset) + assert steps_per_epoch is not None + train_dataloader = self._create_dataloader(train_data, batch_size, + epochs, steps_per_epoch) + self._init_communication() + dist_startup_prog = self._dist_startup_progs["train"][self._cur_rank] + self._executor.run(dist_startup_prog) + for epoch in range(epochs): + # train_dataloader.start() + # for step in range(steps_per_epoch): + # logs = self.train_step(None) + # self._logger.info(logs) + # train_dataloader.reset() + for step, data in enumerate(train_dataloader): + logs = self._train_step(data) + train_logs = { + "train_" + name: val + for name, val in logs.items() + } + self._logger.info(logs) + + def _train_step(self, data): + logs = {} + dist_main_prog = self._dist_main_progs["train"][self._cur_rank] + if self._loss_var.name not in dist_main_prog.global_block().vars: + loss = self._executor.run(dist_main_prog) + logs["loss"] = None + else: + fetch_list = self._loss_var + loss = self._executor.run(dist_main_prog, fetch_list=fetch_list) + logs["loss"] = loss + return logs + + def _create_dataloader(self, dataset, batch_size, epochs, steps_per_epoch): + feed_list = self._input_vars + self._label_vars + dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] + dist_context = self._dist_contexts[self.mode] + dist_main_block = dist_main_prog.global_block() + op_size = len(dist_main_block.ops) + places = paddle.static.cuda_places() + with fluid.program_guard(dist_main_prog, dist_startup_prog): + dataloader = NonIterableGeneratorLoader( + dataset, feed_list, places, batch_size, epochs, steps_per_epoch) + new_op_size = len(dist_main_block.ops) + for idx in range(new_op_size - 1, op_size - 1, -1): + op = dist_main_block.ops[new_op_size - 1] + new_op_desc = dist_main_block.desc._prepend_op() + new_op_desc.copy_from(op.desc) + new_op = Operator( + dist_main_block, new_op_desc, type=new_op_desc.type()) + dist_main_block.ops.insert(0, new_op) + dist_op = DistributedOperator(new_op) + dist_context.add_dist_op_for_program(dist_op) + for _ in range(new_op_size - op_size): + dist_main_block._remove_op(new_op_size, sync=False) + dist_main_block._sync_with_cpp() + return dataloader + + def _init_communication(self): + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. 
+ all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + process_group.instantiate() + + # def save(self, path, training=True): + # pass + + # def load(self, path, strict=True, load_optimizer=True): + # pass diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 220611be18144..0a9eaf34ba512 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -7,4 +7,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) + py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py new file mode 100644 index 0000000000000..0fc1ea41033e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
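The test that follows drives the new Engine end to end. As an orientation before the full listing, the intended call pattern is roughly the sketch below (an outline only, not a standalone script; mlp, dataset, optimizer, loss and dist_strategy are the objects constructed later in this test):

    # high-level auto-parallel workflow, mirroring TestEngineAPI below
    engine = Engine(mlp, data_spec, strategy=dist_strategy)  # wrap the model plus input/label InputSpecs
    engine.prepare(optimizer, loss)                          # build, complete annotation, partition and reshard
    engine.fit(dataset,                                      # train through the generator-based dataloader
               batch_size=batch_size,
               steps_per_epoch=batch_num * batch_size)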
+ +import unittest +import time +import paddle.fluid as fluid +import copy +import os +import numpy as np +import subprocess +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.io import Dataset, IterableDataset, DataLoader +from paddle.static import InputSpec +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.engine import Engine + +paddle.enable_static() +global_process_mesh = auto.ProcessMesh(mesh=[0]) +batch_size = 1 +batch_num = 10 +hidden_size = 1024 +sequence_len = 512 +image_size = hidden_size +class_num = 10 + +paddle.seed(44) + + +class MyDataset(Dataset): + def __init__(self, num_samples): + super(MyDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + # self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + # self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": global_process_mesh, + "dims_mappig": [-1] + }) + # out = self.norm(input) + out = self.linear0(input) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + # out = self.dropout(out) + out = self.linear2(out) + return out + + +class TestEngineAPI(unittest.TestCase): + def test_engine_api(self): + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + dataset = MyDataset(batch_num * batch_size) + data_spec = [ + InputSpec([batch_size, hidden_size], 'float32', 'x'), + InputSpec([batch_size], 'int64', 'label') + ] + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + # init parallel optimizer + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + engine = Engine(mlp, data_spec, strategy=dist_strategy) + engine.prepare(optimizer, loss) + engine.fit(dataset, + batch_size=batch_size, + steps_per_epoch=batch_num * batch_size) + + +if __name__ == "__main__": + unittest.main() From 574f3402f47af8bd6d8d1f16b3734ccf11f91abb Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Tue, 22 Feb 2022 12:02:11 +0800 Subject: [PATCH 028/101] [Paddle-Inference] fix pass and convert_op for preln_ernie (#39733) * fix pass and convert_op for preln_ernie and add 
preln_ernie'flag in pass --- ...n_embedding_eltwise_layernorm_fuse_pass.cc | 13 +++++ .../ir/preln_skip_layernorm_fuse_pass.cc | 22 ++++++-- .../framework/ir/skip_layernorm_fuse_pass.cc | 8 +-- .../inference/analysis/ir_pass_manager.cc | 52 +++++++++---------- .../fluid/inference/api/analysis_predictor.cc | 16 +++--- .../convert/preln_emb_eltwise_layernorm.cc | 18 ++----- .../tensorrt/convert/preln_skip_layernorm.cc | 6 ++- 7 files changed, 76 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc index ca42a613411ba..d6761d2e82ef3 100644 --- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -428,6 +428,19 @@ PrelnEmbeddingEltwiseLayerNormFusePass:: void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); + + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, " + "enable_int8, " + "use_oss, with_interleaved, with_dynamic_shape. Stop this pass, " + "please reconfig."; + return; + } + int fusion_count = PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); if (fusion_count > 0) { diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 1b7b82cbca9e8..978360d8f0a95 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct PrelnSkipLayerNorm : public PatternBase { void operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -62,8 +61,13 @@ void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) ->assert_is_op_output("elementwise_add") ->assert_is_op_input("layer_norm", "X") - ->assert_is_op_input("elementwise_add", "Y"); - + ->assert_more([](Node *x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); @@ -104,6 +108,18 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_skip_layernorm_fuse", graph); + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, " + "use_oss, " + "with_interleaved, with_dynamic_shape. Stop this pass, please " + "reconfig. 
"; + return; + } + int found_subgraph_count = 0; GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index db194d59d37ba..bfa14d9296b26 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct SkipLayerNorm : public PatternBase { PDNode *operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -59,9 +58,10 @@ PDNode *SkipLayerNorm::operator()(PDNode *x, PDNode *y) { y->assert_is_op_input("elementwise_add", "Y"); auto *elementwise = pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); - auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 837b83004de84..796c86a3ad1ef 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -54,6 +54,27 @@ void IRPassManager::CreatePasses(Argument *argument, int pass_num = 0; for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); + pass->Set("disable_logs", new bool(argument->disable_logs())); + auto precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("max_input_shape", new std::map>( + argument->max_input_shape())); + pass->Set("min_input_shape", new std::map>( + argument->min_input_shape())); + pass->Set("optim_input_shape", new std::map>( + argument->optim_input_shape())); + // tuned trt dynamic_shape + pass->Set("trt_tuned_dynamic_shape", + new bool(argument->tensorrt_tuned_dynamic_shape())); + bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && + argument->min_input_shape().size() > 0 && + argument->optim_input_shape().size() > 0) || + argument->tensorrt_tuned_dynamic_shape(); + pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); @@ -99,17 +120,9 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->tensorrt_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - - auto precision_mode = argument->tensorrt_precision_mode(); - bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; - pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); - pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); - pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); - pass->Set("with_interleaved", - new bool(argument->tensorrt_with_interleaved())); 
pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); @@ -161,22 +174,8 @@ void IRPassManager::CreatePasses(Argument *argument, // tuned trt dynamic_shape pass->Set("trt_shape_range_info_path", new std::string(argument->tensorrt_shape_range_info_path())); - pass->Set("trt_tuned_dynamic_shape", - new bool(argument->tensorrt_tuned_dynamic_shape())); pass->Set("trt_allow_build_at_runtime", new bool(argument->tensorrt_allow_build_at_runtime())); - pass->Set("max_input_shape", new std::map>( - argument->max_input_shape())); - pass->Set("min_input_shape", new std::map>( - argument->min_input_shape())); - pass->Set("optim_input_shape", - new std::map>( - argument->optim_input_shape())); - bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && - argument->min_input_shape().size() > 0 && - argument->optim_input_shape().size() > 0) || - argument->tensorrt_tuned_dynamic_shape(); - pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("trt_disabled_ops", new std::vector( argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); @@ -192,14 +191,15 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { - bool enable_int8 = + bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); pass->Set("lite_ops_filter", new std::vector(argument->lite_ops_filter())); pass->Set("predictor_id", new int(argument->predictor_id())); - pass->Set("enable_int8", new bool(enable_int8)); + pass->Erase("enable_int8"); + pass->Set("enable_int8", new bool(lite_enable_int8)); pass->Set("use_gpu", new bool(argument->use_gpu())); pass->Set("zero_copy", new bool(argument->lite_zero_copy())); pass->Set("use_xpu", new bool(argument->use_xpu())); @@ -236,7 +236,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector( argument->nnadapter_model_cache_token())); } - disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; @@ -248,9 +247,6 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); } - - pass->Set("disable_logs", new bool(disable_logs_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3812244fbe22..6c005e4b2d6e4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -592,6 +592,14 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetModelParamsPath(config_.params_file()); } + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); + argument_.SetMinInputShape(config_.min_input_shape_); + argument_.SetMaxInputShape(config_.max_input_shape_); + argument_.SetOptimInputShape(config_.optim_input_shape_); + argument_.SetTensorRtTunedDynamicShape( + config_.tuned_tensorrt_dynamic_shape()); if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); @@ -601,18 
+609,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtDisabledOPs(config_.trt_disabled_ops_); argument_.SetTensorRtUseDLA(config_.trt_use_dla_); argument_.SetTensorRtDLACore(config_.trt_dla_core_); - argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); - argument_.SetTensorRtUseOSS(config_.trt_use_oss_); - argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); - argument_.SetMinInputShape(config_.min_input_shape_); - argument_.SetMaxInputShape(config_.max_input_shape_); - argument_.SetOptimInputShape(config_.optim_input_shape_); argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path()); - argument_.SetTensorRtTunedDynamicShape( - config_.tuned_tensorrt_dynamic_shape()); argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index a58de101053b3..daa3b186ab4c4 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -51,21 +51,11 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); - std::vector id_names; std::vector emb_names; - - id_names = - std::vector{word_id_name, pos_id_name, sent_id_name}; emb_names = std::vector{word_emb_name, pos_emb_name, sent_emb_name}; - int input_num = id_names.size(); - - // Declare inputs - std::vector input_ids; - for (int i = 0; i < input_num; i++) { - input_ids.push_back(engine_->GetITensor(id_names[i])); - } + int input_num = emb_names.size(); // input_embs[0]: word_embedding // input_embs[1]: pos_embedding @@ -126,7 +116,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { {"bert_embeddings_position_embeddings", input_embs[1], nvinfer1::PluginFieldType::kFLOAT32, static_cast(emb_sizes[1])}, - {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + {"output_fp16", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, }; nvinfer1::PluginFieldCollection* plugin_ptr = @@ -156,7 +146,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { shuffle_layer->setReshapeDimensions(shape_dim); shuffle_layer->setName( ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); plugin_inputs.emplace_back( @@ -170,7 +160,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto plugin_layer = engine_->network()->addPluginV2( plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); free(plugin_ptr); float out_0_scale = diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 521e04b8974fd..d9eca65fc45dc 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc 
+++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -92,8 +92,10 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); layer = plugin_layer; - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + std::vector output_names; + output_names.push_back(op_desc.Output("Out_0")[0]); + output_names.push_back(op_desc.Output("Out_1")[0]); + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_names}, test_mode); #else PADDLE_THROW(platform::errors::Fatal( From e89bf25b15d47b669f889667738957bb0c3dfee1 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Tue, 22 Feb 2022 12:55:41 +0800 Subject: [PATCH 029/101] update unittests for nearest_interp_v2_op_xpu: 'sync' from gpu. test=kunlun (#39768) --- paddle/fluid/operators/interpolate_v2_op.h | 7 + .../fluid/operators/interpolate_v2_op_xpu.cc | 18 +- .../xpu/test_nearest_interp_v2_op_xpu.py | 202 +++++++++++++++--- 3 files changed, 177 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 66ab1e14390b3..f99d3f6c32442 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -65,6 +65,13 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } +#endif +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } #endif vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 66314cb74456d..850dbe025b9cb 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -14,7 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/operators/interpolate_v2_op.h" #ifdef PADDLE_WITH_XPU @@ -41,18 +41,6 @@ inline std::vector get_new_shape_xpu( return vec_new_shape; } -template -inline std::vector get_new_data_from_tensor_xpu( - const Tensor* new_data_tensor) { - std::vector vec_new_data; - framework::Tensor cpu_starts_tensor; - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - auto* new_data = cpu_starts_tensor.data(); - vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); - return vec_new_data; -} - template class InterpolateV2XPUKernel : public framework::OpKernel { public: @@ -90,7 +78,7 @@ class InterpolateV2XPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -202,7 +190,7 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = 
get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py index 8de8125166fb3..8c1ce68e9d0f8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py @@ -81,7 +81,80 @@ def nearest_neighbor_interp_np(X, if data_layout == "NHWC": out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + # out = np.expand_dims(out, 2) + return out.astype(X.dtype) + + +def nearest_neighbor_interp3d_np(X, + out_d, + out_h, + out_w, + scale_d=0, + scale_h=0, + scale_w=0, + out_size=None, + actual_shape=None, + align_corners=True, + data_layout='NCHW'): + """nearest neighbor interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + X = np.transpose(X, (0, 4, 1, 2, 3)) # NDHWC => NCDHW + if out_size is not None: + out_d = out_size[0] + out_h = out_size[1] + out_w = out_size[2] + if actual_shape is not None: + out_d = actual_shape[0] + out_h = actual_shape[1] + out_w = actual_shape[2] + n, c, in_d, in_h, in_w = X.shape + ratio_d = ratio_h = ratio_w = 0.0 + if (out_d > 1): + if (align_corners): + ratio_d = (in_d - 1.0) / (out_d - 1.0) + else: + if scale_d > 0: + ratio_d = 1.0 / scale_d + else: + ratio_d = 1.0 * in_d / out_d + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w + out = np.zeros((n, c, out_d, out_h, out_w)) + + if align_corners: + for d in range(out_d): + in_d = int(ratio_d * d + 0.5) + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, d, i, j] = X[:, :, in_d, in_i, in_j] + else: + for d in range(out_d): + in_d = int(ratio_d * d) + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, :, d, i, j] = X[:, :, in_d, in_i, in_j] + + if data_layout == "NDHWC": + out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC return out.astype(X.dtype) @@ -90,46 +163,86 @@ def setUp(self): self.use_xpu = True self.out_size = None self.actual_shape = None + self.data_layout = 'NCHW' self.init_test_case() self.op_type = "nearest_interp_v2" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {'X': input_np} - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale: + if self.data_layout == "NCHW" and len(self.input_shape) == 4: + in_d = 1 + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_d = 1 + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.data_layout == "NCDHW" and len(self.input_shape) == 5: + in_d = self.input_shape[2] + in_h = self.input_shape[3] + in_w = self.input_shape[4] + else: + in_d = self.input_shape[1] + in_h = self.input_shape[2] + in_w = self.input_shape[3] + scale_d = 0 + scale_h = 0 + scale_w = 0 + if self.scale: if isinstance(self.scale, float) or 
isinstance(self.scale, int): if self.scale > 0: - scale_h = scale_w = float(self.scale) + scale_d = scale_h = scale_w = float(self.scale) if isinstance(self.scale, list) and len(self.scale) == 1: - scale_w = scale_h = self.scale[0] + scale_d = scale_w = scale_h = self.scale[0] elif isinstance(self.scale, list) and len(self.scale) > 1: - scale_w = self.scale[1] - scale_h = self.scale[0] - out_h = int(self.input_shape[2] * scale_h) - out_w = int(self.input_shape[3] * scale_w) + if len(self.scale) == 5: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + else: + scale_w = self.scale[1] + scale_h = self.scale[0] + + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + out_d = int(in_d * scale_d) else: + if len(self.input_shape) == 5: + out_d = self.out_d out_h = self.out_h out_w = self.out_w - if self.shape_by_1Dtensor: + if len(self.input_shape) == 4: + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) + elif len(self.input_shape) == 5: + output_np = nearest_neighbor_interp3d_np( + input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w, + self.out_size, self.actual_shape, self.align_corners, + self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + if len(self.input_shape) == 5: + self.attrs = { + 'out_d': self.out_d, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + else: + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: @@ -137,9 +250,6 @@ def setUp(self): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, - self.out_size, self.actual_shape, - self.align_corners) self.outputs = {'Out': output_np} def test_check_output(self): @@ -154,22 +264,26 @@ def test_check_grad(self): def init_test_case(self): self.interp_method = 'nearest' - self.input_shape = [2, 5, 4, 4] - self.out_h = 3 - self.out_w = 3 + self.input_shape = [2, 3, 4, 5] + self.out_h = 2 + self.out_w = 2 self.scale = 0. - self.out_size = [3, 3] + self.out_size = np.array([3, 3]).astype("int32") self.align_corners = True +""" +# case copied form gpu but disabled in xpu: not support 5-dim input_shape class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] + self.input_shape = [4, 1, 1, 7, 8] + self.out_d = 1 self.out_h = 1 self.out_w = 1 self.scale = 0. 
self.align_corners = True +""" class TestNearestNeighborInterpCase2(TestNearestInterpOp): @@ -246,6 +360,8 @@ def init_test_case(self): self.align_corners = True +""" +# case copied form gpu but disabled in xpu: not support NHWC data_layout class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' @@ -256,6 +372,7 @@ def init_test_case(self): self.out_size = np.array([3, 8]).astype("int32") self.align_corners = True self.data_layout = "NHWC" +""" class TestNearestInterpWithoutCorners(TestNearestInterpOp): @@ -296,6 +413,21 @@ def init_test_case(self): self.align_corners = True +""" +# case copied form gpu but disabled in xpu: not support 5-dim input_shape +class TestNearestNeighbor3DInterp(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 4, 7, 5] + self.out_d = 8 + self.out_h = 64 + self.out_w = 32 + self.scale = [4.0, 2.0, 3.0] + self.out_size = np.array([8, 66, 40]).astype("int32") + self.align_corners = True +""" + + class TestNearestInterpOp_attr_tensor(XPUOpTest): def setUp(self): self.use_xpu = True From 62ae5f6232add26ebcbaf78813188cb678bf46e9 Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Tue, 22 Feb 2022 13:06:47 +0800 Subject: [PATCH 030/101] build_cinn_pass: fix bug because of output control var (#39782) --- paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 3516e71b83791..d55950064a4a2 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -375,7 +375,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, const std::unordered_set& ignore_names) { auto result = std::make_unique>(); for (auto* node : nodes) { - if (ignore_names.count(node->Name())) { + if (!node->Var() || ignore_names.count(node->Name())) { continue; } result->emplace_back(node->Name()); From c5d1565519bc92069b59b30864ef0d8e7c492163 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Tue, 22 Feb 2022 14:08:10 +0800 Subject: [PATCH 031/101] Update profiler (#39779) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. 
* add log for Executor * update the profiler Co-authored-by: liutiexing --- .../platform/profiler/cupti_data_process.cc | 3 +- paddle/fluid/platform/profiler/event_python.h | 0 .../fluid/platform/profiler/event_tracing.h | 33 +++++++++++++++++-- .../platform/profiler/host_event_recorder.h | 3 +- .../fluid/platform/profiler/output_logger.h | 0 5 files changed, 34 insertions(+), 5 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/event_python.h mode change 100755 => 100644 paddle/fluid/platform/profiler/output_logger.h diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index 4d3b807aba82e..da12dccb74924 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/profiler/cupti_data_process.h" #include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -26,7 +27,7 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, uint64_t start_ns, return; } DeviceTraceEvent event; - event.name = kernel->name; + event.name = demangle(kernel->name); event.type = TracerEventType::Kernel; event.start_ns = kernel->start; event.end_ns = kernel->end; diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h old mode 100755 new mode 100644 diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 54c5b219310a9..fcaba9a43ca93 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -21,26 +21,55 @@ limitations under the License. */ namespace paddle { namespace platform { +// Default tracing level. +// It is Recommended to set the level explicitly. static constexpr uint32_t kDefaultTraceLevel = 4; -// CPU event tracing. A trace marks something that happens but has no duration + +// Host event tracing. A trace marks something that happens but has no duration // associated with it. For example, thread starts working. // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { + /** + * @param name: It is the caller's reponsibility to manage the underlying + * storage. RecordInstantEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. + */ explicit RecordInstantEvent(const char* name, TracerEventType type, uint32_t level = kDefaultTraceLevel); }; -// CPU event tracing. A trace starts when an object of this clas is created and +// Host event tracing. A trace starts when an object of this clas is created and // stops when the object is destroyed. // Chrome Trace Viewer Format: Duration Event/Complte Event class RecordEvent { public: + /** + * @param name: If your string argument has a longer lifetime (e.g.: string + * literal, static variables, etc) than the event, use 'const char* name'. + * Do your best to avoid using 'std::string' as the argument type. It will + * cause deep-copy to harm performance. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. 
+ */ explicit RecordEvent( const std::string& name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, const EventRole role = EventRole::kOrdinary); + /** + * @param name: It is the caller's reponsibility to manage the underlying + * storage. RecordEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. + */ explicit RecordEvent(const char* name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 3bcd68c559630..49f9362527591 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -202,7 +202,7 @@ class ThreadEventRecorder { ThreadEventSection GatherEvents() { ThreadEventSection thr_sec; - thr_sec.thread_name = thread_name_; + thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); return thr_sec; @@ -210,7 +210,6 @@ class ThreadEventRecorder { private: uint64_t thread_id_; - std::string thread_name_; EventContainer base_evt_cntr_; }; diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h old mode 100755 new mode 100644 From 60fc555e2c33d9fdafde4145dfad9e789d89274c Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 22 Feb 2022 14:28:56 +0800 Subject: [PATCH 032/101] [CustomRuntime] fix CustomDeviceContext (#39766) --- paddle/fluid/platform/CMakeLists.txt | 4 +++- paddle/fluid/platform/device_context.cc | 16 ++++------------ paddle/fluid/platform/device_context.h | 14 ++------------ paddle/phi/common/backend.h | 3 --- paddle/phi/common/place.h | 1 + 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index b808e1561b24a..478b71745e4ac 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -141,7 +141,9 @@ if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() - +if (WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) +endif() if(WITH_ASCEND_CL) target_link_libraries(device_context npu_resource_pool) endif() diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6452f6f7984e3..e5e369efd6bb4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -897,21 +897,13 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { - DeviceGuard guard(place_); - stream_.reset(new stream::Stream()); - stream_->Init(place_); +CustomDeviceContext::CustomDeviceContext(CustomPlace place) + : phi::CustomContext(place) { + Init(); + stream_.reset(new platform::stream::Stream(place, stream())); } CustomDeviceContext::~CustomDeviceContext() {} - -const Place& CustomDeviceContext::GetPlace() const { return place_; } - -void CustomDeviceContext::Wait() const { - // platform::RecordEvent record_event("NPUDeviceContext/wait"); - VLOG(4) 
<< "CustomDevice context(" << this << ") Wait"; - stream_->Wait(); -} #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 0101286f0dfa8..17288b354a280 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/device_context.h" @@ -819,17 +820,12 @@ class MKLDNNDeviceContext : public CPUDeviceContext { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -class CustomDeviceContext : public DeviceContext { +class CustomDeviceContext : public phi::CustomContext { public: explicit CustomDeviceContext(CustomPlace place); virtual ~CustomDeviceContext(); - const Place& GetPlace() const override; - void Wait() const override; Eigen::DefaultDevice* eigen_device() const { return nullptr; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); - } template void AddStreamCallback(Callback&& callback) const { @@ -839,13 +835,7 @@ class CustomDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } private: - std::string device_type_; - - CustomPlace place_; - std::shared_ptr stream_; - - CustomDeviceContext(); }; template <> struct DefaultDeviceContextType { diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 9a2ec093119fd..1d3e4369c6948 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -135,9 +135,6 @@ inline Backend StringToBackend(const char* backend_cstr) { if (s == std::string("Undefined")) { return Backend::UNDEFINED; } - for (size_t i = 0; i < s.size(); ++i) { - s[i] = toupper(s[i]); - } if (s == std::string("CPU")) { return Backend::CPU; } else if (s == std::string("GPU")) { diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index b6adb1c2932bf..36fb910cad6c7 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -188,6 +188,7 @@ class MLUPlace : public Place { class CustomPlace : public Place { public: + CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {} explicit CustomPlace(const std::string dev_type) : Place(AllocationType::CUSTOM, 0, dev_type) {} CustomPlace(const std::string dev_type, int device_id) From 1aa6777890b78a95aff7c3e790a348e3f2395477 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 22 Feb 2022 14:59:32 +0800 Subject: [PATCH 033/101] [Phi] Migrate unfold_op into phi (#39778) * [Phi] Migrate unfold_op into phi * fix im2col CPUContext template instantial * fix unfold_op.h header include problem * fix unittest * fix PT->PD --- paddle/fluid/imperative/prepared_operator.h | 4 +- paddle/fluid/operators/fold_op.cc | 1 - paddle/fluid/operators/math/im2col.cc | 59 ++++--- paddle/fluid/operators/unfold_op.cc | 137 +-------------- paddle/fluid/operators/unfold_op.cu | 26 --- paddle/fluid/operators/unfold_op.h | 121 ------------- paddle/phi/infermeta/unary.cc | 159 ++++++++++++++++++ paddle/phi/infermeta/unary.h | 7 + paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/cpu/unfold_grad_kernel.cc | 21 +++ paddle/phi/kernels/cpu/unfold_kernel.cc | 20 +++ paddle/phi/kernels/funcs/unfold_functor.h | 33 ++++ paddle/phi/kernels/gpu/unfold_grad_kernel.cu | 21 +++ paddle/phi/kernels/gpu/unfold_kernel.cu | 20 +++ 
.../kernels/impl/unfold_grad_kernel_impl.h | 72 ++++++++ paddle/phi/kernels/impl/unfold_kernel_impl.h | 65 +++++++ paddle/phi/kernels/unfold_grad_kernel.h | 32 ++++ paddle/phi/kernels/unfold_kernel.h | 31 ++++ paddle/phi/ops/compat/unfold_sig.cc | 28 +++ 19 files changed, 558 insertions(+), 301 deletions(-) delete mode 100644 paddle/fluid/operators/unfold_op.cu delete mode 100644 paddle/fluid/operators/unfold_op.h create mode 100644 paddle/phi/kernels/cpu/unfold_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/unfold_kernel.cc create mode 100644 paddle/phi/kernels/funcs/unfold_functor.h create mode 100644 paddle/phi/kernels/gpu/unfold_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/unfold_kernel.cu create mode 100644 paddle/phi/kernels/impl/unfold_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/unfold_kernel_impl.h create mode 100644 paddle/phi/kernels/unfold_grad_kernel.h create mode 100644 paddle/phi/kernels/unfold_kernel.h create mode 100644 paddle/phi/ops/compat/unfold_sig.cc diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 589c8edd446bd..714e429798662 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -442,7 +442,9 @@ void BuildDygraphPtenKernelContext( vector_int_attr.end()); kernel_ctx->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 7b97663c387ca..40ec9aef190ff 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -13,7 +13,6 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/fold_op.h" -#include "paddle/fluid/operators/unfold_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 8efd35ca10810..8fc6c52122abf 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -22,6 +22,10 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +namespace phi { +class CPUContext; +} // namespace phi + namespace paddle { namespace operators { namespace math { @@ -31,12 +35,12 @@ namespace math { * col = * [input_channels, filter_height, filter_width, output_height, output_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -73,12 +77,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -155,22 +158,30 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; /* * im = [input_channels, input_height, input_width] * col = * [output_height, output_width, input_channels, filter_height, filter_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -235,12 +246,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -316,11 +326,18 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; - +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 0a8cd6e65f93e..c45b839d5b40b 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -12,7 +12,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/unfold_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -60,126 +62,6 @@ feature map, a series of such columns will be formed. class UnfoldOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of UnfoldOp should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Y"), true, - platform::errors::NotFound("Output(Y) of UnfoldOp should not be null")); - auto in_dims = ctx->GetInputDim("X"); - std::vector kernel_sizes = - ctx->Attrs().Get>("kernel_sizes"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector dilations = - ctx->Attrs().Get>("dilations"); - - // Only [N, C, H, W] input supported now - PADDLE_ENFORCE_EQ( - in_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be 4-D tensor of format [N, C, H, W], but get %u", - in_dims.size())); - PADDLE_ENFORCE_EQ( - in_dims.size() - kernel_sizes.size(), 2U, - platform::errors::InvalidArgument( - "The dims of X should be larger than that of kernel_sizes " - "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", - in_dims.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), kernel_sizes.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of kernel_sizes. " - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", - strides.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - paddings.size(), 2 * strides.size(), - platform::errors::InvalidArgument( - "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", - paddings.size(), strides.size())); - PADDLE_ENFORCE_EQ( - strides.size(), dilations.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of dilations. 
" - "But recieved dims(strides: %u) != dims(dilations: %u).", - strides.size(), dilations.size())); - - // check kernel_sizes - PADDLE_ENFORCE_GT(kernel_sizes[0], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - PADDLE_ENFORCE_GT(kernel_sizes[1], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - // check strides - PADDLE_ENFORCE_GT(strides[0], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - PADDLE_ENFORCE_GT(strides[1], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - // check dilations - PADDLE_ENFORCE_GT( - dilations[0], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - PADDLE_ENFORCE_GT( - dilations[1], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - - std::vector out_dims; - out_dims.push_back(in_dims[0]); - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - if (ctx->IsRuntime()) { - // only check output height and width in runtime - PADDLE_ENFORCE_GT( - output_height, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - PADDLE_ENFORCE_GT( - output_width, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - } - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -237,16 +119,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, + PT_INFER_META(phi::UnfoldInferMeta)); REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker, ops::UnfoldGradMaker, - ops::UnfoldGradMaker); + ops::UnfoldGradMaker, + UnfoldInferShapeFunctor); REGISTER_OPERATOR(unfold_grad, ops::UnfoldGradOp, ops::UnfoldGradOpNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - unfold, 
ops::UnfoldOpKernel, - ops::UnfoldOpKernel); -REGISTER_OP_CPU_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.cu b/paddle/fluid/operators/unfold_op.cu deleted file mode 100644 index 46584506d4315..0000000000000 --- a/paddle/fluid/operators/unfold_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unfold_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unfold, ops::UnfoldOpKernel, - ops::UnfoldOpKernel); - -REGISTER_OP_CUDA_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h deleted file mode 100644 index f35bce3abff2b..0000000000000 --- a/paddle/fluid/operators/unfold_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -inline int CalcOutputSize(int input_size, int filter_size, int dilation, - int padding1, int padding2, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - return output_size; -} - -template -class UnfoldOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - const int batch_size = static_cast(input->dims()[0]); - Tensor* output = ctx.Output("Y"); - output->mutable_data(ctx.GetPlace()); - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - math::Im2ColFunctor im2col; - auto& dev_ctx = ctx.template device_context(); - - auto input_dims = input->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - im2col(dev_ctx, in_batch, dilations, strides, paddings, &out_batch); - } - } -}; - -template -class UnfoldGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = ctx.Input(framework::GradVarName("Y")); - Tensor* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - if ((!output_grad) || (!input_grad)) return; - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input_grad->dims()[0]); - - auto input_dims = input_grad->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - math::Col2ImFunctor col2im; - auto& dev_ctx = ctx.template device_context(); - - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, input_grad, static_cast(0)); - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); - col2im(dev_ctx, out_grad_batch, dilations, strides, paddings, - &in_grad_batch); - } - } -}; -} // namespace operators -} // 
namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 66a91e0ca53e8..fda395e6d95ec 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { @@ -537,6 +538,164 @@ void TraceInferMeta( out->set_dims(phi::make_ddim(sizes)); } +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config) { + auto in_dims = x.dims(); + // Only [N, C, H, W] input supported now + PADDLE_ENFORCE_EQ( + in_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be 4-D tensor of format [N, C, H, W], but get %u", + in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size() - kernel_sizes.size(), + 2U, + phi::errors::InvalidArgument( + "The dims of X should be larger than that of kernel_sizes " + "by a number of 2, due to the batch size and input channel dim. " + "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + in_dims.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + kernel_sizes.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of kernel_sizes. " + "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + strides.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + paddings.size(), + 2 * strides.size(), + phi::errors::InvalidArgument( + "The dims of paddings should be 2 times of that of strides. " + "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + paddings.size(), + strides.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + dilations.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of dilations. 
" + "But recieved dims(strides: %u) != dims(dilations: %u).", + strides.size(), + dilations.size())); + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_sizes[0], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_sizes[1], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(strides[0], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + PADDLE_ENFORCE_GT(strides[1], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilations[0], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + PADDLE_ENFORCE_GT( + dilations[1], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = phi::funcs::CalcOutputSize(in_dims[2], + kernel_sizes[0], + dilations[0], + paddings[0], + paddings[2], + strides[0]); + int output_width = phi::funcs::CalcOutputSize(in_dims[3], + kernel_sizes[1], + dilations[1], + paddings[1], + paddings[3], + strides[1]); + if (config.is_runtime) { + // only check output height and width in runtime + PADDLE_ENFORCE_GT( + output_height, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + PADDLE_ENFORCE_GT( + output_width, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + out->set_dims(phi::make_ddim(out_dims)); +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 2ab425d42cd33..c6d5d250d98aa 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -93,4 +93,11 @@ void SplitInferMeta(const MetaTensor& x_meta, void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + 
const std::vector& dilations, + MetaTensor* out, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4f78a6500f434..f819eb3de3ef7 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PTEN_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc new file mode 100644 index 0000000000000..c97005dd84547 --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unfold_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + unfold_grad, CPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc new file mode 100644 index 0000000000000..e38d8acd09820 --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unfold_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_kernel_impl.h" + +PD_REGISTER_KERNEL(unfold, CPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/unfold_functor.h b/paddle/phi/kernels/funcs/unfold_functor.h new file mode 100644 index 0000000000000..2bd5437a7f10a --- /dev/null +++ b/paddle/phi/kernels/funcs/unfold_functor.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +//////// CalcOutputSize Functor /////// +inline int CalcOutputSize(int input_size, + int filter_size, + int dilation, + int padding1, + int padding2, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; + return output_size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/unfold_grad_kernel.cu b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu new file mode 100644 index 0000000000000..3740f59603bef --- /dev/null +++ b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" +#include "paddle/phi/kernels/unfold_grad_kernel.h" + +PD_REGISTER_KERNEL( + unfold_grad, GPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/unfold_kernel.cu b/paddle/phi/kernels/gpu/unfold_kernel.cu new file mode 100644 index 0000000000000..4f72a6f794e5f --- /dev/null +++ b/paddle/phi/kernels/gpu/unfold_kernel.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_kernel_impl.h" +#include "paddle/phi/kernels/unfold_kernel.h" + +PD_REGISTER_KERNEL(unfold, GPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h new file mode 100644 index 0000000000000..5556654ee7c0d --- /dev/null +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace phi { + +template +void UnfoldGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + + if (!x_grad) return; + + auto x_dims = x_grad->dims(); + const int batch_size = static_cast(x_dims[0]); + + int out_height = phi::funcs::CalcOutputSize(x_dims[2], + kernel_sizes[0], + dilations[0], + paddings[0], + paddings[2], + strides[0]); + int out_width = phi::funcs::CalcOutputSize(x_dims[3], + kernel_sizes[1], + dilations[1], + paddings[1], + paddings[3], + strides[1]); + + DDim x_shape = make_ddim({x_dims[1], x_dims[2], x_dims[3]}); + DDim out_matrix_shape = make_ddim( + {x_dims[1], kernel_sizes[0], kernel_sizes[1], out_height, out_width}); + + paddle::operators::math:: + Col2ImFunctor + col2im; + + phi::funcs::SetConstant set_zero; + set_zero(ctx, x_grad, static_cast(0)); + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + out_grad.Slice(i, i + 1).Resize(out_matrix_shape); + DenseTensor x_grad_batch = x_grad->Slice(i, i + 1).Resize(x_shape); + col2im(ctx, out_grad_batch, dilations, strides, paddings, &x_grad_batch); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h new file mode 100644 index 0000000000000..e914f6cacbde9 --- /dev/null +++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace phi { + +template +void UnfoldKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + ctx.template Alloc(out); + + paddle::operators::math:: + Im2ColFunctor + im2col; + auto x_dims = x.dims(); + + int out_height = phi::funcs::CalcOutputSize(x_dims[2], + kernel_sizes[0], + dilations[0], + paddings[0], + paddings[2], + strides[0]); + int out_width = phi::funcs::CalcOutputSize(x_dims[3], + kernel_sizes[1], + dilations[1], + paddings[1], + paddings[3], + strides[1]); + + DDim x_shape = make_ddim({x_dims[1], x_dims[2], x_dims[3]}); + DDim out_matrix_shape = make_ddim( + {x_dims[1], kernel_sizes[0], kernel_sizes[1], out_height, out_width}); + + for (int i = 0; i < batch_size; i++) { + DenseTensor in_batch = x.Slice(i, i + 1).Resize(x_shape); + DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_matrix_shape); + im2col(ctx, in_batch, dilations, strides, paddings, &out_batch); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/unfold_grad_kernel.h b/paddle/phi/kernels/unfold_grad_kernel.h new file mode 100644 index 0000000000000..6578cf8c650b4 --- /dev/null +++ b/paddle/phi/kernels/unfold_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void UnfoldGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/unfold_kernel.h b/paddle/phi/kernels/unfold_kernel.h new file mode 100644 index 0000000000000..d26805e978697 --- /dev/null +++ b/paddle/phi/kernels/unfold_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void UnfoldKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/unfold_sig.cc b/paddle/phi/ops/compat/unfold_sig.cc new file mode 100644 index 0000000000000..ddc3b1813cbef --- /dev/null +++ b/paddle/phi/ops/compat/unfold_sig.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("unfold_grad", + {"X", GradVarName("Y")}, + {"kernel_sizes", "strides", "paddings", "dilations"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unfold_grad, phi::UnfoldGradOpArgumentMapping); From a710738e4ff5eaec9924cf9ce9164daa6d8b8c1b Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Tue, 22 Feb 2022 15:07:11 +0800 Subject: [PATCH 034/101] unset fluid in nn.others (#34935) --- python/paddle/__init__.py | 52 ++++++++--------- python/paddle/framework/__init__.py | 9 +++ python/paddle/nn/__init__.py | 1 - python/paddle/nn/functional/activation.py | 56 +++++++++---------- python/paddle/nn/functional/common.py | 53 +++++++++--------- python/paddle/nn/functional/conv.py | 38 +++++++------ python/paddle/nn/functional/extension.py | 8 +-- python/paddle/nn/functional/input.py | 7 +-- python/paddle/nn/functional/loss.py | 38 ++++++------- python/paddle/nn/functional/norm.py | 16 +++--- python/paddle/nn/functional/pooling.py | 50 ++++++++--------- .../paddle/nn/functional/sparse_attention.py | 6 +- python/paddle/nn/functional/vision.py | 13 +++-- python/paddle/nn/initializer/assign.py | 11 ++-- python/paddle/nn/initializer/dirac.py | 6 +- python/paddle/nn/initializer/orthogonal.py | 4 +- python/paddle/nn/layer/activation.py | 2 - python/paddle/nn/layer/common.py | 4 +- python/paddle/nn/layer/conv.py | 13 +++-- python/paddle/nn/layer/distance.py | 4 +- python/paddle/nn/layer/loss.py | 6 +- python/paddle/nn/layer/norm.py | 6 +- python/paddle/nn/layer/rnn.py | 17 ++++-- python/paddle/nn/quant/functional_layers.py | 4 +- python/paddle/nn/quant/quant_layers.py | 38 ++++++------- python/paddle/nn/utils/weight_norm_hook.py | 40 ++++++------- python/paddle/tensor/manipulation.py | 1 + 27 files changed, 258 insertions(+), 245 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 12d31aee41e39..bba9c226dc07b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -22,23 +22,32 @@ ) from .batch import batch # noqa: F401 -from .fluid import 
monkey_patch_variable -from .fluid.dygraph import monkey_patch_math_varbase +from .framework import monkey_patch_variable +from .framework import monkey_patch_math_varbase monkey_patch_variable() monkey_patch_math_varbase() + +from .framework import disable_signal_handler # noqa: F401 +from .framework import get_flags # noqa: F401 +from .framework import set_flags # noqa: F401 + +from .framework import disable_static # noqa: F401 +from .framework import enable_static # noqa: F401 +from .framework import in_dynamic_mode # noqa: F401 + from .framework.dtype import dtype as dtype # noqa: F401 -from paddle.framework.dtype import uint8 # noqa: F401 -from paddle.framework.dtype import int8 # noqa: F401 -from paddle.framework.dtype import int16 # noqa: F401 -from paddle.framework.dtype import int32 # noqa: F401 -from paddle.framework.dtype import int64 # noqa: F401 -from paddle.framework.dtype import float16 # noqa: F401 -from paddle.framework.dtype import float32 # noqa: F401 -from paddle.framework.dtype import float64 # noqa: F401 -from paddle.framework.dtype import bfloat16 # noqa: F401 -from paddle.framework.dtype import bool # noqa: F401 -from paddle.framework.dtype import complex64 # noqa: F401 -from paddle.framework.dtype import complex128 # noqa: F401 +from .framework.dtype import uint8 # noqa: F401 +from .framework.dtype import int8 # noqa: F401 +from .framework.dtype import int16 # noqa: F401 +from .framework.dtype import int32 # noqa: F401 +from .framework.dtype import int64 # noqa: F401 +from .framework.dtype import float16 # noqa: F401 +from .framework.dtype import float32 # noqa: F401 +from .framework.dtype import float64 # noqa: F401 +from .framework.dtype import bfloat16 # noqa: F401 +from .framework.dtype import bool # noqa: F401 +from .framework.dtype import complex64 # noqa: F401 +from .framework.dtype import complex128 # noqa: F401 from .framework import VarBase as Tensor # noqa: F401 Tensor.__qualname__ = 'Tensor' # noqa: F401 import paddle.compat # noqa: F401 @@ -142,6 +151,7 @@ from .tensor.manipulation import scatter_nd # noqa: F401 from .tensor.manipulation import shard_index # noqa: F401 from .tensor.manipulation import slice # noqa: F401 +from .tensor.manipulation import crop # noqa: F401 from .tensor.manipulation import split # noqa: F401 from .tensor.manipulation import squeeze # noqa: F401 from .tensor.manipulation import squeeze_ # noqa: F401 @@ -316,23 +326,15 @@ from .device import get_cudnn_version # noqa: F401 from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 -from .fluid.framework import is_compiled_with_cinn # noqa: F401 -from .fluid.framework import is_compiled_with_cuda # noqa: F401 -from .fluid.framework import is_compiled_with_rocm # noqa: F401 -from .fluid.framework import disable_signal_handler # noqa: F401 -from .fluid.framework import get_flags # noqa: F401 -from .fluid.framework import set_flags # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 from .device import is_compiled_with_ipu # noqa: F401 from .device import is_compiled_with_mlu # noqa: F401 +from .device import is_compiled_with_cinn # noqa: F401 +from .device import is_compiled_with_cuda # noqa: F401 +from .device import is_compiled_with_rocm # noqa: F401 from .device import XPUPlace # noqa: F401 -from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 -from .fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401 -from .fluid.framework import 
in_dygraph_mode as in_dynamic_mode # noqa: F401 -from .fluid.layers import crop_tensor as crop # noqa: F401 - # high-level api from .hapi import Model # noqa: F401 from . import callbacks # noqa: F401 diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 72e8e73ce7c2e..7da9c0accfb49 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -39,4 +39,13 @@ from .io import load # noqa: F401 from ..fluid.dygraph.parallel import DataParallel # noqa: F401 +from ..fluid import monkey_patch_variable +from ..fluid.dygraph import monkey_patch_math_varbase +from ..fluid.framework import disable_signal_handler # noqa: F401 +from ..fluid.framework import get_flags # noqa: F401 +from ..fluid.framework import set_flags # noqa: F401 +from ..fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 +from ..fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401 +from ..fluid.framework import in_dygraph_mode as in_dynamic_mode # noqa: F401 + __all__ = [] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index ad8f28f40bb58..c0820e140268b 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -14,7 +14,6 @@ # TODO: import all neural network related api under this directory, # including layers, linear, conv, rnn etc. - from ..fluid.dygraph.layers import Layer # noqa: F401 from ..fluid.dygraph.container import LayerList # noqa: F401 from ..fluid.dygraph.container import ParameterList # noqa: F401 diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index ac08ac9391eb3..91449ef538ff3 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -22,11 +22,11 @@ import warnings from ...fluid.layer_helper import LayerHelper -from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_ -from ...fluid import core +from ...fluid.framework import convert_np_dtype_to_dtype_ from ...fluid.data_feeder import check_variable_and_dtype, check_dtype import paddle -from paddle import _C_ops +from paddle import _C_ops, in_dynamic_mode +from paddle.framework import core __all__ = [] @@ -61,7 +61,7 @@ def celu(x, alpha=1.0, name=None): if alpha == 0: raise ZeroDivisionError("alpha cannot be 0 for celu") - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.celu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') @@ -110,7 +110,7 @@ def elu(x, alpha=1.0, name=None): # [ 1. 15.6 ]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.elu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') @@ -174,7 +174,7 @@ def gelu(x, approximate=False, name=None): # [ 0.84119201, 1.39957154]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.gelu(x, 'approximate', approximate) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu') @@ -222,7 +222,7 @@ def hardshrink(x, threshold=0.5, name=None): out = F.hardshrink(x) # [-1., 0., 2.5] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.hard_shrink(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -273,7 +273,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): out = F.hardtanh(x) # [-1., 0.3, 1.] 
""" - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.brelu(x, 't_min', min, 't_max', max) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -328,7 +328,7 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): out = F.hardsigmoid(x) # [0., 1., 0.666667] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -382,7 +382,7 @@ def hardswish(x, name=None): out = F.hardswish(x) # [0., 5., 0.666667] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.hard_swish(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -427,7 +427,7 @@ def leaky_relu(x, negative_slope=0.01, name=None): out = F.leaky_relu(x) # [-0.02, 0., 1.] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.leaky_relu(x, 'alpha', negative_slope) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -518,7 +518,7 @@ def prelu(x, weight, data_format="NCHW", name=None): 1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." mode = 'channel' - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.prelu(x, weight, 'mode', mode, 'data_format', data_format) helper = LayerHelper('prelu', **locals()) @@ -560,7 +560,7 @@ def relu(x, name=None): out = F.relu(x) # [0., 0., 1.] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.relu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu') @@ -605,7 +605,7 @@ def log_sigmoid(x, name=None): out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.logsigmoid(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -672,7 +672,7 @@ def maxout(x, groups, axis=1, name=None): # [0.7142536 0.88725346 0.61093384 0.38833922]]]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.maxout(x, 'groups', groups, 'axis', axis) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') @@ -721,7 +721,7 @@ def relu6(x, name=None): out = F.relu6(x) # [0, 0.3, 6] """ threshold = 6.0 - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.relu6(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6') @@ -780,7 +780,7 @@ def selu(x, raise ValueError( "The alpha must be no less than zero. 
Received: {}.".format(alpha)) - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.selu(x, 'scale', scale, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu') @@ -821,7 +821,7 @@ def silu(x, name=None): out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.silu(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'silu') @@ -951,7 +951,7 @@ def softmax(x, axis=-1, dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) use_cudnn = True - if in_dygraph_mode(): + if in_dynamic_mode(): outs_cast = x if dtype is None \ else _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) return _C_ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', use_cudnn) @@ -1026,7 +1026,7 @@ def softplus(x, beta=1, threshold=20, name=None): x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.softplus(x, 'beta', beta, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1081,7 +1081,7 @@ def softshrink(x, threshold=0.5, name=None): "The threshold must be no less than zero. Received: {}.".format( threshold)) - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.softshrink(x, 'lambda', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1122,7 +1122,7 @@ def softsign(x, name=None): x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.softsign(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1160,7 +1160,7 @@ def swish(x, name=None): out = F.swish(x) # [-0.238406, 0., 0.731059] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.swish(x, 'beta', 1.0) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') @@ -1204,7 +1204,7 @@ def mish(x, name=None): x = paddle.to_tensor([-5., 0., 5.]) out = F.mish(x) # [-0.03357624, 0., 4.99955208] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.mish(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mish') @@ -1240,7 +1240,7 @@ def tanhshrink(x, name=None): x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.tanh_shrink(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1286,7 +1286,7 @@ def thresholded_relu(x, threshold=1.0, name=None): out = F.thresholded_relu(x) # [2., 0., 0.] 
""" - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.thresholded_relu(x, 'threshold', threshold) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], @@ -1360,7 +1360,7 @@ def log_softmax(x, axis=-1, dtype=None, name=None): if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if in_dynamic_mode(): if dtype is not None: x = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) return _C_ops.log_softmax(x, 'axis', axis) @@ -1498,7 +1498,7 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.gumbel_softmax(x, 'temperature', temperature, 'hard', hard, 'axis', axis) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5a010ad2f20c5..ed668ed124c23 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -14,13 +14,11 @@ import warnings import paddle -from ...fluid.framework import in_dygraph_mode, default_main_program from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.tensor import fill_constant from ...tensor import concat from ...tensor.creation import zeros from paddle.static import Variable -from ...fluid.layers import core from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network from ...fluid.layers import unfold # noqa: F401 @@ -30,13 +28,17 @@ from ...tensor import sum from ...tensor import sqrt from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -from ...fluid.framework import in_dygraph_mode, _varbase_creator +from ...fluid.framework import _varbase_creator -from ...fluid.framework import in_dygraph_mode -from ...fluid import core, dygraph_utils -from ...fluid import core, layers +from ...fluid import dygraph_utils +from ...fluid import layers from ...fluid.data_feeder import check_variable_and_dtype + from paddle import _C_ops +from paddle.framework import in_dynamic_mode +from paddle.tensor.creation import full +from paddle.framework import core +from paddle.static import default_main_program __all__ = [] @@ -353,11 +355,11 @@ def _is_list_or_turple_(data): if out_shape is not None and scale is not None: raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: - if isinstance(out_shape, Variable) and not in_dygraph_mode(): + if isinstance(out_shape, Variable) and not in_dynamic_mode(): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: - if in_dygraph_mode(): + if in_dynamic_mode(): if isinstance(out_shape, Variable): out_shape = list(out_shape.numpy()) for i, dim in enumerate(out_shape): @@ -428,7 +430,7 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[2] else: - if in_dygraph_mode() and isinstance(scale, Variable): + if in_dynamic_mode() and isinstance(scale, Variable): scale = list(scale.numpy()) if isinstance(scale, Variable): scale.stop_gradient = True @@ -454,7 +456,7 @@ def _is_list_or_turple_(data): "Attr(scale)'s type should be float, int, list, tuple, or Tensor." 
) - if in_dygraph_mode(): + if in_dynamic_mode(): attr_list = [] for k, v in attrs.items(): attr_list.append(k) @@ -719,7 +721,7 @@ def bilinear(x1, x2, weight, bias=None, name=None): """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.bilinear_tensor_product(x1, x2, weight, bias) check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear') @@ -891,7 +893,7 @@ def dropout(x, seed = None mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer - if in_dygraph_mode(): + if in_dynamic_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed out, mask = _C_ops.dropout( @@ -930,7 +932,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): attrs=attrs) return out else: #sometimes called dropout_nd #TODO: optimize with c++ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout') dtype = x.dtype keep_prob = 1 - p @@ -943,7 +945,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): #get mask shape input_shape = x.shape - if not in_dygraph_mode(): + if not in_dynamic_mode(): input_shape_tensor = paddle.shape(x) drop_axes = [axis] if isinstance(axis, int) else list(axis) if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1: @@ -954,7 +956,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}". format(len(input_shape), len(drop_axes))) mask_shape = [1] * len(input_shape) - if not in_dygraph_mode(): + if not in_dynamic_mode(): for i in drop_axes: mask_shape[i] = input_shape_tensor[i] else: @@ -964,7 +966,7 @@ def get_attrs(prog, dropout_prob, is_test, seed): #get mask random_tensor = paddle.uniform( mask_shape, dtype='float32', min=0., max=1.0) - p = layers.fill_constant(shape=[1], dtype='float32', value=p) + p = full(shape=[1], fill_value=p, dtype='float32') keep_mask = paddle.greater_equal(random_tensor, p) scale_input = paddle.cast(scale_input, dtype) @@ -1122,7 +1124,7 @@ def alpha_dropout(x, p=0.5, training=True, name=None): if p < 0 or p > 1: raise ValueError("p argument should between 0 and 1") - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'alpha_dropout') @@ -1142,16 +1144,15 @@ def alpha_dropout(x, p=0.5, training=True, name=None): #get mask random_tensor = paddle.uniform( input_shape, dtype='float32', min=0., max=1.0) - p = layers.fill_constant(shape=[1], dtype='float32', value=p) + p = full(shape=[1], fill_value=p, dtype='float32') keep_mask = paddle.greater_equal(random_tensor, p) keep_mask = paddle.cast(keep_mask, dtype) drop_mask = paddle.subtract( - layers.fill_constant( - shape=input_shape, dtype=dtype, value=1.), - keep_mask) + full( + shape=input_shape, fill_value=1., dtype=dtype), keep_mask) #apply mask - b = layers.fill_constant(shape=[1], dtype=dtype, value=b) + b = full(shape=[1], fill_value=b, dtype=dtype) y = paddle.add(paddle.multiply(x, keep_mask), paddle.scale( drop_mask, scale=alpha_p)) @@ -1347,7 +1348,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): unsqueezed_dim = [1] x = unsqueeze(x, axis=unsqueezed_dim) - if in_dygraph_mode(): + if in_dynamic_mode(): if isinstance(pad, Variable): pad = pad.numpy() out = _C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", value, @@ -1519,7 +1520,7 @@ def linear(x, weight, bias=None, name=None): # [0.9440598 0.9440598 0.9440598 0.9440598 ] # [2.1077576 2.1077576 2.1077576 2.1077576 
]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', False) @@ -1614,7 +1615,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): if epsilon > 1. or epsilon < 0.: raise ValueError("The value of epsilon must be between 0 and 1.") - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], @@ -1765,7 +1766,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ if (seed is None or seed == 0) and default_main_program().random_seed != 0: seed = default_main_program().random_seed - if in_dygraph_mode(): + if in_dynamic_mode(): remapped_label, sampled_class_center = _C_ops.class_center_sample( label, 'num_classes', num_classes, 'num_samples', num_samples, 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 31cb91bc93b48..f7d765d854116 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -16,9 +16,8 @@ import numpy as np from ...device import get_cudnn_version -from ...fluid.framework import in_dygraph_mode from ...static import Variable -from ...fluid import core, dygraph_utils, get_flags +from ...fluid import dygraph_utils from ...fluid.layers.utils import convert_to_list, _is_symmetric_padding from ...fluid.data_feeder import check_variable_and_dtype from ...framework import ParamAttr @@ -27,6 +26,11 @@ from ...tensor.manipulation import unsqueeze, squeeze from ...tensor.math import add from ...fluid.layers import nn +from paddle.device import is_compiled_with_cuda +from paddle.device import is_compiled_with_rocm +from paddle.device import is_compiled_with_npu +from paddle import in_dynamic_mode +from paddle import get_flags __all__ = [] @@ -114,7 +118,7 @@ def _conv_nd(x, name=None): # Due to the poor performance of NHWC, we transpose the input to NCHW. 
- if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', use_mkldnn, 'fuse_relu_before_depthwise_conv', False, @@ -342,13 +346,13 @@ def conv1d(x, l_type = "conv2d" # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance - if (core.is_compiled_with_cuda() and num_channels == groups and + if (is_compiled_with_cuda() and num_channels == groups and num_channels != 1 and num_filters % num_channels == 0): l_type = 'depthwise_conv2d' use_cudnn = False # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if core.is_compiled_with_npu(): + if is_compiled_with_npu(): if (num_channels == groups and num_channels == num_filters): l_type = 'depthwise_conv2d' else: @@ -357,7 +361,7 @@ def conv1d(x, squeeze_aixs = -3 if channel_last else -2 x = unsqueeze(x, axis=[squeeze_aixs]) - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", @@ -553,7 +557,7 @@ def conv2d(x, cudnn_version = get_cudnn_version() - use_cudnn = True if (core.is_compiled_with_cuda() and + use_cudnn = True if (is_compiled_with_cuda() and cudnn_version is not None) else False use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] @@ -567,20 +571,20 @@ def conv2d(x, if (num_channels == groups and num_channels != 1 and num_filters % num_channels == 0): l_type = 'depthwise_conv2d' - if core.is_compiled_with_rocm(): + if is_compiled_with_rocm(): use_cudnn = True else: use_cudnn = False # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if core.is_compiled_with_npu(): + if is_compiled_with_npu(): if (num_channels == groups and num_channels == num_filters): l_type = 'depthwise_conv2d' else: l_type = 'conv2d' - if (core.is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn") - ["FLAGS_conv2d_disable_cudnn"]): + if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn"]): use_cudnn = False return _conv_nd(x, weight, bias, stride, padding, padding_algorithm, @@ -815,7 +819,7 @@ def conv1d_transpose(x, x = unsqueeze(x, axis=[squeeze_axis]) weight = unsqueeze(weight, axis=[-1]) - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('output_padding', output_padding, 'output_size', output_size, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'dilations', dilation, 'groups', groups, @@ -1026,7 +1030,7 @@ def conv2d_transpose(x, cudnn_version = get_cudnn_version() - use_cudnn = True if (core.is_compiled_with_cuda() and + use_cudnn = True if (is_compiled_with_cuda() and cudnn_version is not None) else False # update attrs @@ -1057,7 +1061,7 @@ def conv2d_transpose(x, op_type = 'depthwise_conv2d_transpose' use_cudnn = False - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('output_padding', output_padding, 'output_size', output_size, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'dilations', dilation, 'groups', groups, @@ -1242,7 +1246,7 @@ def conv3d(x, groups)) cudnn_version = get_cudnn_version() - use_cudnn = True if (core.is_compiled_with_cuda() and + use_cudnn = True if (is_compiled_with_cuda() and cudnn_version is not None) else False padding, padding_algorithm = _update_padding_nd(padding, 
channel_last, 3) @@ -1458,13 +1462,13 @@ def conv3d_transpose(x, cudnn_version = get_cudnn_version() #TODO(LielinJiang): whether to use cudnn according to the version of cudnn - use_cudnn = True if (core.is_compiled_with_cuda() and + use_cudnn = True if (is_compiled_with_cuda() and cudnn_version is not None) else False op_type = 'conv3d_transpose' data_format_ = "NHWC" if channel_last else "NCHW" - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('output_padding', output_padding, 'output_size', output_size, 'paddings', padding, "padding_algorithm", padding_algorithm, 'strides', stride, 'dilations', dilation, 'groups', groups, diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index bccb7bc7334fb..6a8686b612e7f 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -17,12 +17,12 @@ import numpy as np from ...fluid.data_feeder import check_dtype from ...fluid.layer_helper import LayerHelper -from ...fluid.framework import in_dygraph_mode from ...static import Variable from ...tensor.creation import assign -from ...fluid import core, dygraph_utils +from ...fluid import dygraph_utils from ...fluid.layers.layer_function_generator import templatedoc -from ...fluid.layers.sequence_lod import sequence_mask +from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 +from paddle import in_dynamic_mode __all__ = [] @@ -125,7 +125,7 @@ def __check_input(input, offset, dim1, dim2): "dim1 and dim2 cannot be the same dimension." \ "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2) - if not in_dygraph_mode(): + if not in_dynamic_mode(): __check_input(input, offset, dim1, dim2) helper = LayerHelper("diag_embed", **locals()) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index f71d3001f6f3b..de8a7ff6d3c7b 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -14,12 +14,11 @@ from __future__ import print_function import warnings -from ...fluid.framework import in_dygraph_mode from ...static import Variable from ...fluid.layer_helper import LayerHelper -from ...fluid.layers import core from ...fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle import _C_ops +from paddle import in_dynamic_mode __all__ = [] @@ -87,7 +86,7 @@ def one_hot(x, num_classes, name=None): """ - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range', False) else: @@ -196,7 +195,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): raise ValueError("padding_idx must be within [-{}, {})".format( weight.shape[0], weight.shape[0])) - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.lookup_table_v2( weight, x, 'is_sparse', sparse, 'is_distributed', False, 'remote_prefetch', False, 'padding_idx', padding_idx) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 8dc040325934f..636d2f645c5b0 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -14,15 +14,12 @@ # limitations under the License. 
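Editor's note: every functional API touched in this patch keeps the same two-branch shape; only the dygraph test changes from fluid's in_dygraph_mode() to the public paddle.in_dynamic_mode(). A minimal sketch of that dispatch, using a hypothetical operator name `foo` (illustrative only, not part of the patch):

from paddle import _C_ops, in_dynamic_mode
from paddle.fluid.layer_helper import LayerHelper

def foo(x):
    if in_dynamic_mode():
        # eager/dygraph execution: call the compiled kernel directly
        return _C_ops.foo(x)
    # static graph: append the op to the current program instead
    helper = LayerHelper('foo', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(type='foo', inputs={'X': x}, outputs={'Out': out})
    return out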
import paddle -from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype -import paddle.fluid as fluid # TODO: define loss functions of neural network import numpy as np import paddle import paddle.fluid as fluid -from ...fluid.framework import core, in_dygraph_mode from ...fluid.layers.nn import _elementwise_op_in_dygraph from ...fluid.layers import dice_loss # noqa: F401 from ...fluid.layers import log_loss # noqa: F401 @@ -34,11 +31,12 @@ from ...fluid.layers import edit_distance # noqa: F401 from ...fluid.layers import huber_loss from ...fluid.layer_helper import LayerHelper -from ...fluid.framework import in_dygraph_mode from ...fluid.framework import _varbase_creator from ...static import Variable from paddle.utils import deprecated from paddle import _C_ops +from paddle import in_dynamic_mode +from paddle.framework import core __all__ = [] @@ -115,7 +113,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', "'mean' or 'none', but received %s, which is not allowed." % reduction) - if in_dygraph_mode(): + if in_dynamic_mode(): out = _C_ops.bce_loss(input, label) if weight is not None: out = _C_ops.elementwise_mul(out, weight, 'axis', -1) @@ -249,7 +247,7 @@ def binary_cross_entropy_with_logits(logit, "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." % reduction) - if in_dygraph_mode(): + if in_dynamic_mode(): one = _varbase_creator(dtype=logit.dtype) _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False, 'dtype', one.dtype, @@ -284,8 +282,7 @@ def binary_cross_entropy_with_logits(logit, out = paddle.fluid.layers.sigmoid_cross_entropy_with_logits( logit, label, name=sigmoid_name) - one = paddle.fluid.layers.fill_constant( - shape=[1], value=1.0, dtype=logit.dtype) + one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype) if pos_weight is not None: fluid.data_feeder.check_variable_and_dtype( pos_weight, 'pos_weight', ['float32', 'float64'], @@ -392,7 +389,7 @@ def hsigmoid_loss(input, # [2.2407534]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): out, _, _ = _C_ops.hierarchical_sigmoid( input, weight, label, path_table, path_code, bias, 'num_classes', num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) @@ -569,7 +566,7 @@ def margin_ranking_loss(input, raise ValueError( "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." % reduction) - if fluid.framework.in_dygraph_mode(): + if in_dynamic_mode(): out = _C_ops.elementwise_sub(other, input) out = _C_ops.elementwise_mul(out, label) if margin != 0.0: @@ -595,8 +592,7 @@ def margin_ranking_loss(input, if margin != 0.0: margin_var = out.block.create_var(dtype=out.dtype) - paddle.fluid.layers.fill_constant( - [1], out.dtype, margin, out=margin_var) + margin_var = paddle.full(shape=[1], fill_value=margin, dtype=out.dtype) out = paddle.add(out, margin_var) result_out = helper.create_variable_for_type_inference(input.dtype) @@ -686,7 +682,7 @@ def l1_loss(input, label, reduction='mean', name=None): "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." 
% reduction) - if in_dygraph_mode(): + if in_dynamic_mode(): unreduced = _elementwise_op_in_dygraph( input, label, axis=-1, act='abs', op_name='elementwise_sub') if reduction == 'mean': @@ -776,7 +772,7 @@ def nll_loss(input, input_dims)) n = input_shape[0] c = input_shape[1] - if in_dygraph_mode(): + if in_dynamic_mode(): if input_dims != 2 and input_dims != 4: input, _ = _C_ops.reshape2(input, None, 'shape', [n, c, 1, -1]) label, _ = _C_ops.reshape2(label, None, 'shape', [n, 1, -1]) @@ -995,7 +991,7 @@ def mse_loss(input, label, reduction='mean', name=None): "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', " "but received {}.".format(reduction)) - if not paddle.fluid.framework.in_dygraph_mode(): + if not in_dynamic_mode(): paddle.fluid.data_feeder.check_variable_and_dtype( input, 'input', ['float32', 'float64'], 'mse_loss') paddle.fluid.data_feeder.check_variable_and_dtype( @@ -1099,7 +1095,7 @@ def ctc_loss(log_probs, loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times, input_lengths, label_lengths) - loss_out = fluid.layers.squeeze(loss_out, [-1]) + loss_out = paddle.squeeze(loss_out, [-1]) assert reduction in ['mean', 'sum', 'none'] if reduction == 'mean': loss_out = paddle.mean(loss_out / label_lengths) @@ -1319,7 +1315,7 @@ def margin_cross_entropy(logits, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=-1) - if in_dygraph_mode(): + if in_dynamic_mode(): softmax, loss = _C_ops.margin_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', @@ -1664,7 +1660,7 @@ def cross_entropy(input, (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) - if in_dygraph_mode(): + if in_dynamic_mode(): if soft_label == False: valid_label = paddle.cast( label != ignore_index, dtype=label.dtype) * label @@ -1978,7 +1974,7 @@ def sigmoid_focal_loss(logit, "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.". 
format(normalizer_dims)) - if in_dygraph_mode(): + if in_dynamic_mode(): one = _varbase_creator(dtype=logit.dtype) _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False, 'dtype', one.dtype, @@ -2025,7 +2021,7 @@ def sigmoid_focal_loss(logit, loss = paddle.nn.functional.binary_cross_entropy_with_logits( logit, label, reduction='none', name=bce_name) - pred = fluid.layers.sigmoid(logit) + pred = paddle.nn.functional.sigmoid(logit) p_t = pred * label + (1 - pred) * (1 - label) alpha_t = alpha * label + (1 - alpha) * (1 - label) @@ -2125,7 +2121,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " "but received {}.".format(reduction)) - if not paddle.fluid.framework.in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'hinge_embedding_loss') check_variable_and_dtype(label, 'label', ['float32', 'float64'], diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 9b765a1d7c782..c59d0eb5e6d11 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -17,13 +17,13 @@ import paddle.fluid as fluid from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper -from ...fluid.framework import in_dygraph_mode, core from ...framework import create_parameter from ..initializer import Constant from ...framework import ParamAttr -from ...fluid import core, dygraph_utils +from ...fluid import dygraph_utils import numbers from paddle import _C_ops +from paddle import in_dynamic_mode __all__ = [] @@ -78,7 +78,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): # [[0. 0.24253564 0.37139067] # [1. 
0.97014254 0.9284767 ]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) out = _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim', True, 'epsilon', epsilon) @@ -104,7 +104,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): helper.append_op( type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) eps = out.block.create_var(dtype=out.dtype) - paddle.fluid.layers.fill_constant([1], out.dtype, epsilon, out=eps) + eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype) return paddle.divide(x, paddle.maximum(out, eps), name=name) @@ -180,7 +180,7 @@ def batch_norm(x, else: trainable_statistics = not use_global_stats - if in_dygraph_mode(): + if in_dynamic_mode(): # for dygraph need tuple attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", not training, "data_layout", data_format, "use_mkldnn", False, @@ -301,7 +301,7 @@ def layer_norm(x, str_normalized_shape[ 1:] + ', but got input shape ' + str(input_shape)) - if in_dygraph_mode(): + if in_dynamic_mode(): pre_act, _, _ = _C_ops.layer_norm(x, weight, bias, 'epsilon', epsilon, 'begin_norm_axis', begin_norm_axis) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) @@ -385,7 +385,7 @@ def instance_norm(x, """ - if in_dygraph_mode(): + if in_dynamic_mode(): out, _, _ = _C_ops.instance_norm(x, weight, bias, "epsilon", eps, "momentum", momentum, "data_format", data_format) @@ -474,7 +474,7 @@ def local_response_norm(x, y = paddle.nn.functional.local_response_norm(x, size=5) print(y.shape) # [3, 3, 112, 112] """ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm') if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']: raise ValueError( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 01ddf05fb82d2..a528a72ec5cac 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -13,13 +13,11 @@ # limitations under the License. # TODO: define pooling functions -from ...fluid import core -from ...fluid.framework import in_dygraph_mode from ...fluid.layers import utils, LayerHelper from ...tensor.manipulation import unsqueeze, squeeze from ...fluid.data_feeder import check_type, check_variable_and_dtype from paddle import _C_ops -from paddle import _C_ops +from paddle import in_dynamic_mode __all__ = [] @@ -210,7 +208,7 @@ def avg_pool1d(x, """ """NCL to NCHW""" data_format = "NCHW" - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'avg_pool1d') _check_input(x, 3) x = unsqueeze(x, [2]) @@ -232,7 +230,7 @@ def avg_pool1d(x, # use 2d to implenment 1d should expand padding in advance. 
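Editor's note: the fluid.layers.fill_constant -> paddle.full substitutions above (dropout, alpha_dropout, several losses, normalize) only change the spelling, not the value produced; both calls create a constant-filled tensor. A minimal sketch (illustrative only, not part of the patch):

import paddle
import paddle.fluid as fluid

eps_old = fluid.layers.fill_constant(shape=[1], dtype='float32', value=1e-12)  # legacy fluid spelling
eps_new = paddle.full(shape=[1], fill_value=1e-12, dtype='float32')            # public 2.x replacement
# both yield a float32 tensor of shape [1] holding 1e-12

The remaining difference is that fill_constant could write into a pre-created out= variable, which the patch drops in favour of plain reassignment.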
padding = _expand_low_nd_padding(padding) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.pool2d( x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', @@ -346,7 +344,7 @@ def avg_pool2d(x, padding, padding_algorithm = _update_padding_nd( padding, 2, channel_last, ceil_mode=ceil_mode) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'strides', stride, 'paddings', @@ -468,7 +466,7 @@ def avg_pool3d(x, _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3) _check_value_limitation(stride, "stride", min_limit=1e-3) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.pool3d( x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, 'padding_algorithm', @@ -571,7 +569,7 @@ def max_pool1d(x, """ """NCL to NCHW""" data_format = "NCHW" - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool1d') _check_input(x, 3) x = unsqueeze(x, [2]) @@ -587,7 +585,7 @@ def max_pool1d(x, # use 2d to implenment 1d should expand padding in advance. padding = _expand_low_nd_padding(padding) - if in_dygraph_mode(): + if in_dynamic_mode(): if return_mask: pool_out = _C_ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', @@ -746,7 +744,7 @@ def max_unpool1d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -861,7 +859,7 @@ def max_unpool2d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -973,7 +971,7 @@ def max_unpool3d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.unpool3d(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -1029,7 +1027,7 @@ def max_pool2d(x, "When setting return_mask to true, data_format must be set to NCHW in API:max_pool2d" ) - if in_dygraph_mode(): + if in_dynamic_mode(): if return_mask: output = _C_ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', @@ -1160,7 +1158,7 @@ def max_pool3d(x, "When setting return_mask to true, data_format must be set to NCDHW in API:max_pool3d" ) - if in_dygraph_mode(): + if in_dynamic_mode(): if return_mask: output = _C_ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', @@ -1250,7 +1248,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): # pool_out shape: [1, 3, 16]) """ pool_type = 'avg' - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'adaptive_pool2d') check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') @@ -1258,7 +1256,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') x = unsqueeze(x, 
[2]) - if in_dygraph_mode(): + if in_dynamic_mode(): pool_out = _C_ops.pool2d(x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) return squeeze(pool_out, [2]) @@ -1333,7 +1331,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size=[3, 3]) # out.shape is [2, 3, 3, 3] """ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'adaptive_avg_pool2d') check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d') @@ -1357,7 +1355,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): if output_size[1] == None: output_size[1] = in_w - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size, 'global_pooling', False, 'adaptive', True, 'data_format', data_format) @@ -1437,7 +1435,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] """ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_avg_pool3d') check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d') @@ -1463,7 +1461,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): if output_size[2] == None: output_size[2] = in_w - if in_dygraph_mode(): + if in_dynamic_mode(): output = _C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize', output_size, 'global_pooling', False, 'adaptive', True, 'data_format', data_format) @@ -1537,7 +1535,7 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] """ pool_type = 'max' - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d') check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') @@ -1547,7 +1545,7 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') x = unsqueeze(x, [2]) - if in_dygraph_mode(): + if in_dynamic_mode(): pool_out = _C_ops.max_pool2d_with_index( x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) return (squeeze(pool_out[0], [2]), squeeze( @@ -1619,7 +1617,7 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): output_size=[3, 3]) # out.shape is [2, 3, 3, 3] """ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool2d') check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool2d') @@ -1636,7 +1634,7 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): if output_size[1] == None: output_size[1] = in_w - if in_dygraph_mode(): + if in_dynamic_mode(): pool_out = _C_ops.max_pool2d_with_index( x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) return pool_out if return_mask else pool_out[0] @@ -1710,7 +1708,7 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): # out.shape is [2, 3, 3, 3, 3] """ - if not in_dygraph_mode(): + if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool3d') check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool3d') @@ -1729,7 +1727,7 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): if output_size[2] == None: output_size[2] = in_w - if in_dygraph_mode(): + if in_dynamic_mode(): pool_out = 
_C_ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) return pool_out if return_mask else pool_out[0] diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py index c39fcb8554a2f..53be014527815 100644 --- a/python/paddle/nn/functional/sparse_attention.py +++ b/python/paddle/nn/functional/sparse_attention.py @@ -14,10 +14,10 @@ import warnings import paddle -from ...fluid.framework import in_dygraph_mode, default_main_program +from ...fluid.framework import default_main_program from paddle.fluid.layer_helper import LayerHelper -from ...fluid.framework import in_dygraph_mode from paddle import _C_ops +from paddle import in_dynamic_mode def sparse_attention(query, @@ -143,7 +143,7 @@ def sparse_attention(query, # [1.60885942, 2.60885954], # [1.99830270, 2.99830270]]]] """ - if in_dygraph_mode(): + if in_dynamic_mode(): result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( query, key, value, sparse_csr_offset, sparse_csr_columns, key_padding_mask, attn_mask) diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index bd3e27a25e12c..43c7757a8777b 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -13,13 +13,14 @@ # limitations under the License. from ...device import get_cudnn_version -from ...fluid.framework import core, in_dygraph_mode from ...static import Variable from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype from ...fluid import dygraph_utils import numpy as np from paddle import _C_ops +from ...device import is_compiled_with_rocm +from paddle import in_dynamic_mode __all__ = [] @@ -83,14 +84,14 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): use_cudnn = True else: use_cudnn = False - if core.is_compiled_with_rocm(): + if is_compiled_with_rocm(): use_cudnn = False # ROCM platform do not have MIOPEN kernel for affine_grid if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ isinstance(out_shape, Variable)): raise ValueError("The out_shape should be a list, tuple or Tensor.") - if in_dygraph_mode(): + if in_dynamic_mode(): _out_shape = out_shape.numpy().tolist() if isinstance( out_shape, Variable) else out_shape return _C_ops.affine_grid(theta, "output_shape", _out_shape, @@ -263,7 +264,7 @@ def grid_sample(x, cudnn_version = get_cudnn_version() use_cudnn = False - if not core.is_compiled_with_rocm() and ( + if not is_compiled_with_rocm() and ( cudnn_version is not None ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros': use_cudnn = True @@ -271,7 +272,7 @@ def grid_sample(x, x.stop_gradient = False grid.stop_gradient = False - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners', align_corners, 'use_cudnn', use_cudnn) out = getattr(_C_ops, 'grid_sampler')(x, grid, *attrs) @@ -329,7 +330,7 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): "But recevie Attr(data_format): {} ".format( data_format)) - if in_dygraph_mode(): + if in_dynamic_mode(): return _C_ops.pixel_shuffle(x, "upscale_factor", upscale_factor, "data_format", data_format) diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py index 13a70a179ffe3..746d2b67b2a1d 100644 --- a/python/paddle/nn/initializer/assign.py +++ b/python/paddle/nn/initializer/assign.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from ...fluid import framework -from ...fluid import core -from ...fluid import unique_name -from ...fluid.core import VarDesc +import paddle from ...fluid.data_feeder import check_type from ...fluid.initializer import NumpyArrayInitializer @@ -88,13 +84,14 @@ class Assign(NumpyArrayInitializer): def __init__(self, value, name=None): import numpy check_type(value, 'value', - (numpy.ndarray, list, tuple, framework.Variable), 'Assign') + (numpy.ndarray, list, tuple, paddle.static.Variable), + 'Assign') if (isinstance(value, (list, tuple))): value = numpy.array(value) # TODO: value is already is a tensor, accounting efficiency maybe it does not need to convert tensor to numpy data and then initialized. - if (isinstance(value, framework.Variable)): + if (isinstance(value, paddle.static.Variable)): value = value.numpy() super(Assign, self).__init__(value) diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 26aa349b5b1b4..514afb15a8edb 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -15,7 +15,9 @@ from ...fluid.initializer import Initializer from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.core import VarDesc -from ...fluid import unique_name, framework +from ...fluid import framework +from paddle import in_dynamic_mode +from paddle.utils import unique_name __all__ = [] @@ -221,6 +223,6 @@ def __call__(self, var, block=None): "out_dtype": var.dtype}, stop_gradient=True) - if not framework.in_dygraph_mode(): + if not in_dynamic_mode(): var.op = op return op diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 8e0acb9ab2d20..84cdb971d77d4 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -14,9 +14,9 @@ from ...fluid.initializer import Initializer from ...fluid.data_feeder import check_variable_and_dtype -from ...fluid.core import VarDesc -from ...fluid import unique_name, framework +from ...fluid import framework from ...tensor import diag, transpose, sign, qr, reshape +from paddle.utils import unique_name __all__ = [] diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 617981cb8f74c..400585c431830 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -14,8 +14,6 @@ # TODO: define activation functions of neural network -from ...fluid import core -from ...fluid.framework import in_dygraph_mode from ...framework import ParamAttr from ..initializer import Constant from paddle.framework import get_default_dtype diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 89ff156bded2a..9ae9d5bec437e 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -15,10 +15,10 @@ # TODO: define the common classes to build a neural network import paddle from ...fluid.dygraph import Flatten # noqa: F401 -from ...fluid.framework import in_dygraph_mode from .. 
import functional as F from ...fluid.framework import _dygraph_tracer from paddle.nn import Layer +from paddle import in_dynamic_mode __all__ = [] @@ -1456,7 +1456,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - if in_dygraph_mode() and padding_idx != -1: + if in_dynamic_mode() and padding_idx != -1: with paddle.no_grad(): self.weight[padding_idx] = 0.0 diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 26fd544ecce11..fd7355e162ae7 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -16,14 +16,15 @@ import numpy as np -from ...fluid import get_flags -from ...fluid import core +from paddle import get_flags from ...device import get_cudnn_version from .. import Layer from ..initializer import Normal from .. import functional as F from ...fluid.layers import utils from ..functional.conv import _update_padding_nd +from ...device import is_compiled_with_cuda +from ...device import is_compiled_with_rocm __all__ = [] @@ -138,7 +139,7 @@ def _get_default_param_initializer(): cudnn_version = get_cudnn_version() - self._use_cudnn = True if (core.is_compiled_with_cuda() and + self._use_cudnn = True if (is_compiled_with_cuda() and cudnn_version is not None) else False self._op_type = "conv" + str(dims) + 'd' @@ -146,13 +147,13 @@ def _get_default_param_initializer(): in_channels != 1 and out_channels % in_channels == 0): self._op_type = 'depthwise_conv2d' - if core.is_compiled_with_rocm(): + if is_compiled_with_rocm(): self._use_cudnn = True else: self._use_cudnn = False - if (core.is_compiled_with_cuda() and get_flags( - "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn"]): self._use_cudnn = False def extra_repr(self): diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 0547bf75a4bf6..1fb7e8c4f2148 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -16,10 +16,10 @@ import paddle from .. import Layer -from ...fluid.framework import core, in_dygraph_mode from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper from paddle import _C_ops +from paddle import in_dynamic_mode __all__ = [] @@ -78,7 +78,7 @@ def __init__(self, p=2., epsilon=1e-6, keepdim=False, name=None): check_type(self.keepdim, 'keepdim', (bool), 'PairwiseDistance') def forward(self, x, y): - if in_dygraph_mode(): + if in_dynamic_mode(): sub = _C_ops.elementwise_sub(x, y) return _C_ops.p_norm(sub, 'axis', 1, 'porder', self.p, 'keepdim', self.keepdim, 'epsilon', self.epsilon) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 9da41f26969c8..7e40c029a02ec 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -16,11 +16,11 @@ # TODO: define loss functions of neural network import numpy as np import paddle.fluid as fluid -import paddle.fluid.core as core import paddle from .. import functional as F -from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator +from paddle.fluid.framework import _varbase_creator from .. 
import Layer +from paddle import in_dynamic_mode __all__ = [] @@ -591,7 +591,7 @@ def __init__(self, reduction='mean'): self.reduction = reduction def forward(self, input, label): - if not fluid.framework.in_dygraph_mode(): + if not in_dynamic_mode(): fluid.data_feeder.check_variable_and_dtype( input, 'input', ['float32', 'float64'], 'MSELoss') fluid.data_feeder.check_variable_and_dtype( diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index de9f8663e6769..8113073d757d6 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -33,12 +33,11 @@ from ...fluid.dygraph import SpectralNorm # noqa: F401 from ...framework import get_default_dtype, set_default_dtype -from ...fluid.framework import in_dygraph_mode from ..initializer import Constant from ...framework import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype, check_type -from ...fluid import core, dygraph_utils +from ...fluid import dygraph_utils from ..functional import batch_norm, layer_norm, instance_norm @@ -49,6 +48,7 @@ from .. import functional as F from paddle import _C_ops from .. import Layer +from paddle import in_dynamic_mode __all__ = [] @@ -1087,7 +1087,7 @@ def forward(self, x): ### train mode: use mini-batch stats, eval mode: use global stats ### use_global_stats only support False in sync_batch_norm - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_format, "use_mkldnn", False, "fuse_with_relu", diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index f7d5448d1324b..2bb1f1311107b 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -33,6 +33,11 @@ from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as from paddle.fluid.data_feeder import convert_dtype from paddle import _C_ops +from paddle import in_dynamic_mode +from paddle.framework import core +from paddle.static import default_startup_program +from paddle.static import program_guard + __all__ = [] @@ -970,8 +975,8 @@ def flatten_parameters(self): # dropout state may also can be hided and avoid saving # should dropout state be persistable for static-graph self._dropout_state = self.create_variable( - dtype=fluid.core.VarDesc.VarType.UINT8) - if fluid.framework.in_dygraph_mode(): + dtype=core.VarDesc.VarType.UINT8) + if in_dynamic_mode(): with paddle.no_grad(): _C_ops.coalesce_tensor(self._all_weights, self._all_weights, self._flat_weight[0], "copy_data", @@ -979,8 +984,8 @@ def flatten_parameters(self): params[0].dtype) return # for static-graph, append coalesce_tensor into startup program - with fluid.program_guard(fluid.default_startup_program(), - fluid.default_startup_program()): + with program_guard(default_startup_program(), + default_startup_program()): with paddle.no_grad(): self._helper.append_op( type="coalesce_tensor", @@ -999,7 +1004,7 @@ def _cudnn_impl(self, inputs, initial_states, sequence_length): if not self.time_major: inputs = paddle.tensor.transpose(inputs, [1, 0, 2]) - if fluid.framework.in_dygraph_mode(): + if in_dynamic_mode(): _, _, out, state = _C_ops.rnn( inputs, initial_states, self._all_weights, sequence_length, self._dropout_state, self.state_components, 'dropout_prob', @@ -1014,7 +1019,7 @@ def _cudnn_impl(self, inputs, initial_states, sequence_length): for i in range(self.state_components) ] reserve = self._helper.create_variable_for_type_inference( - 
dtype=fluid.core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) inputs = { 'Input': inputs, diff --git a/python/paddle/nn/quant/functional_layers.py b/python/paddle/nn/quant/functional_layers.py index ce5fb3e616eb5..2c0eb88e0875c 100644 --- a/python/paddle/nn/quant/functional_layers.py +++ b/python/paddle/nn/quant/functional_layers.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.dygraph import layers from ...tensor import math, manipulation +from .. import Layer __all__ = [] -class FloatFunctionalLayer(layers.Layer): +class FloatFunctionalLayer(Layer): def __init__(self): super(FloatFunctionalLayer, self).__init__() diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 7ad43da6ed5c8..8e9316a19623b 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.dygraph import layers -from paddle.fluid import core +from paddle.framework import core from paddle.fluid import dygraph_utils -from paddle.fluid import unique_name -from paddle.fluid.param_attr import ParamAttr +from paddle.utils import unique_name +from paddle.framework import ParamAttr from paddle.fluid.framework import _varbase_creator -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.initializer import Constant +from paddle.nn.initializer import Constant from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.nn import functional as F import logging from paddle.fluid.log_helper import get_logger from paddle import _C_ops +from paddle import in_dynamic_mode +from paddle.nn import Layer __all__ = [ 'FakeQuantAbsMax', @@ -43,7 +43,7 @@ __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -class FakeQuantAbsMax(layers.Layer): +class FakeQuantAbsMax(Layer): r""" FakeQuantAbsMax layer does the abs_max quant and then dequant. Its computational formula is described as below: @@ -76,7 +76,7 @@ def __init__(self, self._scale = None def forward(self, input): - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('bit_length', self._quant_bits) quant_out = _varbase_creator( type=input.type, @@ -125,7 +125,7 @@ def forward(self, input): return quant_out -class FakeQuantMovingAverageAbsMax(layers.Layer): +class FakeQuantMovingAverageAbsMax(Layer): r""" FakeQuantMovingAverageAbsMax layer does the moving_average_abs_max quant and then dequant. 
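Editor's note: the computational formulas these FakeQuant docstrings refer to are elided from this hunk; for the abs_max variant they amount to a quantize/dequantize round trip against the tensor's absolute maximum. A minimal sketch, assuming the standard symmetric abs_max scheme rather than quoting Paddle's exact kernel (illustrative only, not part of the patch):

import paddle

def fake_quant_dequant_abs_max(x, bit_length=8):
    scale = paddle.max(paddle.abs(x))                      # per-tensor abs_max scale
    bnt = (1 << (bit_length - 1)) - 1                      # 127 for 8-bit quantization
    return paddle.round(x / scale * bnt) * scale / bnt     # quantize, then dequantize

The moving_average_abs_max variant differs only in how the scale is tracked (an exponential moving average across steps), not in the quant-dequant arithmetic itself.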
Its computational formula is described as below: @@ -175,7 +175,7 @@ def __init__(self, self._accum.stop_gradient = True def forward(self, input): - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('moving_rate', self._moving_rate, 'bit_length', self._quant_bits, 'is_test', not self.training) quant_out = _varbase_creator( @@ -223,7 +223,7 @@ def forward(self, input): return quant_out -class FakeQuantChannelWiseAbsMax(layers.Layer): +class FakeQuantChannelWiseAbsMax(Layer): def __init__(self, name=None, channel_num=None, @@ -253,7 +253,7 @@ def __init__(self, self._scale = None def forward(self, input): - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('bit_length', self._quant_bits, 'quant_axis', self._quant_axis) quant_out = _varbase_creator( @@ -306,7 +306,7 @@ def forward(self, input): return quant_out -class MovingAverageAbsMaxScale(layers.Layer): +class MovingAverageAbsMaxScale(Layer): def __init__(self, name=None, moving_rate=0.9, dtype='float32'): r""" MovingAverageMaxScale layer is used to calculating the output quantization @@ -345,7 +345,7 @@ def __init__(self, name=None, moving_rate=0.9, dtype='float32'): self._accum.stop_gradient = True def forward(self, input): - if in_dygraph_mode(): + if in_dynamic_mode(): attrs = ('moving_rate', self._moving_rate, 'is_test', not self.training) state = self._state if self.training else None @@ -393,7 +393,7 @@ def forward(self, input): QuantStub = MovingAverageAbsMaxScale -class QuantizedConv2D(layers.Layer): +class QuantizedConv2D(Layer): """ The computational logic of QuantizedConv2D is the same with Conv2D. The only difference is that its inputs are all fake quantized. @@ -482,7 +482,7 @@ def forward(self, input): data_format=self._data_format) -class QuantizedConv2DTranspose(layers.Layer): +class QuantizedConv2DTranspose(Layer): """ The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. @@ -588,7 +588,7 @@ def forward(self, input, output_size=None): data_format=self._data_format) -class QuantizedLinear(layers.Layer): +class QuantizedLinear(Layer): """ The computational logic of QuantizedLinear is the same with Linear. The only difference is that its inputs are all fake quantized. @@ -657,7 +657,7 @@ def forward(self, input): return out -class MAOutputScaleLayer(layers.Layer): +class MAOutputScaleLayer(Layer): """ Add MovingAverageMaxScale layer to the behind of the input layer. Calculate the scale (moving average abs max) for the output of the input layer. @@ -684,7 +684,7 @@ def forward(self, *inputs, **kwargs): return self._ma_output_scale(out) -class FakeQuantMAOutputScaleLayer(layers.Layer): +class FakeQuantMAOutputScaleLayer(Layer): """ Add FakeQuantMovingAverageAbsMax layer to the behind of the input layer. """ diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 8d2cc8062d2cc..c131d218a1cde 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import paddle import numpy as np from ... 
import fluid from ...fluid import dygraph @@ -39,25 +39,25 @@ def l2_norm(x, axis, epsilon=1e-12, name=None): "axis": 1 if axis is None else axis, "epsilon": epsilon, }) - return F.squeeze(norm, axes=[axis]) + return paddle.squeeze(norm, axis=[axis]) def norm_except_dim(p, dim): shape = p.shape ndims = len(shape) if dim == -1: - return F.sqrt(F.reduce_sum(F.square(p)) + 1e-12) + return paddle.sqrt(paddle.sum(paddle.square(p)) + 1e-12) elif dim == 0: - p_matrix = F.reshape(p, (shape[0], -1)) + p_matrix = paddle.reshape(p, (shape[0], -1)) return l2_norm(p_matrix, axis=1) elif dim == ndims - 1: - p_matrix = F.reshape(p, (-1, shape[-1])) + p_matrix = paddle.reshape(p, (-1, shape[-1])) return l2_norm(p_matrix, axis=0) else: perm = list(range(ndims)) perm[0] = dim perm[dim] = 0 - p_transposed = F.transpose(p, perm) + p_transposed = paddle.transpose(p, perm) return norm_except_dim(p_transposed, 0) @@ -66,25 +66,25 @@ def _weight_norm(v, g, dim): ndims = len(shape) if dim == -1: - v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12) + v_normalized = v / (paddle.sqrt(paddle.sum(paddle.square(v))) + 1e-12) elif dim == 0: - p_matrix = F.reshape(v, (shape[0], -1)) + p_matrix = paddle.reshape(v, (shape[0], -1)) v_normalized = F.l2_normalize(p_matrix, axis=1) - v_normalized = F.reshape(v_normalized, shape) + v_normalized = paddle.reshape(v_normalized, shape) elif dim == ndims - 1: - p_matrix = F.reshape(v, (-1, shape[-1])) + p_matrix = paddle.reshape(v, (-1, shape[-1])) v_normalized = F.l2_normalize(p_matrix, axis=0) - v_normalized = F.reshape(v_normalized, shape) + v_normalized = paddle.reshape(v_normalized, shape) else: perm = list(range(ndims)) perm[0] = dim perm[dim] = 0 - p_transposed = F.transpose(v, perm) + p_transposed = paddle.transpose(v, perm) transposed_shape = p_transposed.shape - p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1)) + p_matrix = paddle.reshape(p_transposed, (p_transposed.shape[0], -1)) v_normalized = F.l2_normalize(p_matrix, axis=1) - v_normalized = F.reshape(v_normalized, transposed_shape) - v_normalized = F.transpose(v_normalized, perm) + v_normalized = paddle.reshape(v_normalized, transposed_shape) + v_normalized = paddle.transpose(v_normalized, perm) weight = F.elementwise_mul( v_normalized, g, axis=dim if dim is not None else -1) return weight @@ -130,9 +130,9 @@ def apply(layer, name, dim): layer.add_parameter(name + "_v", v) g = layer.create_parameter(g_var.shape, dtype=g_var.dtype) layer.add_parameter(name + '_g', g) - with dygraph.no_grad(): - F.assign(w, v) - F.assign(g_var, g) + with paddle.no_grad(): + paddle.assign(w, v) + paddle.assign(g_var, g) setattr(layer, name, fn.compute_weight(layer)) layer.register_forward_pre_hook(fn) @@ -145,8 +145,8 @@ def remove(self, layer): del layer._parameters[self.name + '_v'] w = layer.create_parameter(w_var.shape, dtype=w_var.dtype) layer.add_parameter(self.name, w) - with dygraph.no_grad(): - F.assign(w_var, w) + with paddle.no_grad(): + paddle.assign(w_var, w) def __call__(self, layer, inputs): setattr(layer, self.name, self.compute_weight(layer)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index d8ebae9d6bf39..4df026cfa4892 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -30,6 +30,7 @@ from ..fluid.layers import scatter_nd # noqa: F401 from ..fluid.layers import shard_index # noqa: F401 +from ..fluid.layers import crop_tensor as crop # noqa: F401 from ..fluid.layers.nn import _elementwise_op_in_dygraph from 
..fluid import layers from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only From a6abb6e7b3a9fec5051acf1368a0199118794e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Tue, 22 Feb 2022 15:45:41 +0800 Subject: [PATCH 035/101] add pten convert pass.test=develop (#39664) --- paddle/infrt/CMakeLists.txt | 18 +-- paddle/infrt/api/infrt_api.cc | 3 +- .../{pten_allocator.h => phi_allocator.h} | 2 +- .../host/{pten_context.h => phi_context.h} | 2 +- paddle/infrt/dialect/CMakeLists.txt | 6 +- paddle/infrt/dialect/basic_kernels.cc | 5 +- paddle/infrt/dialect/basic_kernels.td | 8 +- paddle/infrt/dialect/dense_tensor.cc | 90 +----------- paddle/infrt/dialect/dense_tensor.h | 62 +------- paddle/infrt/dialect/dense_tensor.td | 24 ++-- paddle/infrt/dialect/infrt/CMakeLists.txt | 10 +- paddle/infrt/dialect/infrt/common_type.cc | 88 ++++++++++++ paddle/infrt/dialect/infrt/common_type.h | 47 +++++++ paddle/infrt/dialect/infrt/infrt_dialect.cc | 103 ++++++++++++-- paddle/infrt/dialect/infrt/infrt_dialect.h | 6 + paddle/infrt/dialect/infrt/infrt_ops.td | 51 ++----- paddle/infrt/dialect/infrt/infrt_ops_base.td | 49 +++++++ paddle/infrt/dialect/infrt_base.cc | 53 ------- paddle/infrt/dialect/infrt_base.h | 2 +- paddle/infrt/dialect/infrt_base.td | 6 +- paddle/infrt/dialect/init_infrt_dialects.cc | 10 +- paddle/infrt/dialect/mlir_loader_test.cc | 10 +- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/phi/CMakeLists.txt | 18 +++ .../infrt_phi_base.td} | 20 +-- paddle/infrt/dialect/phi/infrt_phi_kernel.td | 31 ++++ .../infrt_phi_tensor.cc} | 16 +-- .../infrt_phi_tensor.h} | 8 +- .../infrt_phi_tensor.td} | 22 +-- paddle/infrt/dialect/phi/pass/CMakeLists.txt | 7 + .../infrt/dialect/phi/pass/kernel_op_desc.cc | 133 ++++++++++++++++++ .../infrt/dialect/phi/pass/kernel_op_desc.h | 32 +++++ .../infrt/dialect/phi/pass/phi_op_cvt_pass.cc | 116 +++++++++++++++ .../infrt/dialect/phi/pass/phi_op_cvt_pass.h | 57 ++++++++ .../dialect/phi/pass/proto_arg_map_context.cc | 73 ++++++++++ .../dialect/phi/pass/proto_arg_map_context.h | 55 ++++++++ .../{pten/pten_base.cc => phi/phi_base.cc} | 24 ++-- .../{pten/pten_base.h => phi/phi_base.h} | 8 +- paddle/infrt/dialect/phi/phi_exec.cc | 47 +++++++ paddle/infrt/dialect/pten/CMakeLists.txt | 13 -- .../infrt/dialect/pten/infrt_pten_kernel.td | 26 ---- paddle/infrt/dialect/test_kernels.cc | 2 +- paddle/infrt/dialect/test_kernels.td | 2 +- paddle/infrt/external_kernels/basic.mlir | 6 +- paddle/infrt/external_kernels/fc.mlir | 50 +++---- paddle/infrt/external_kernels/paddle.mlir | 64 ++++----- paddle/infrt/host_context/mlir_exec.cc | 8 +- .../infrt/host_context/mlir_tests/basic.mlir | 24 ++-- .../host_context/mlir_tests/dense_tensor.mlir | 8 +- .../infrt/host_context/mlir_tests/shape.mlir | 2 +- .../host_context/mlir_to_runtime_translate.cc | 8 +- .../host_context/mlir_to_runtime_translate.h | 2 +- .../mlir_to_runtime_translate_test.cc | 38 ++--- paddle/infrt/host_context/value.cc | 2 +- paddle/infrt/host_context/value.h | 20 +-- paddle/infrt/kernel/CMakeLists.txt | 4 +- paddle/infrt/kernel/basic_kernels.cc | 24 ++-- paddle/infrt/kernel/control_flow_kernels.cc | 2 +- .../infrt/kernel/{pten => phi}/CMakeLists.txt | 12 +- .../kernel/{pten => phi}/allocator_kernels.cc | 8 +- .../kernel/{pten => phi}/allocator_kernels.h | 8 +- .../kernel/{pten => phi}/context_kernels.cc | 8 +- .../kernel/{pten => phi}/context_kernels.h | 8 +- .../{pten => phi}/dense_tensor_kernels.cc | 8 +- .../{pten => 
phi}/dense_tensor_kernels.h | 8 +- .../infershaped/infershape_launchers_test.cc | 6 +- .../infershaped_kernel_launcher.cc | 2 +- .../infershaped/infershaped_kernel_launcher.h | 0 .../infershaped_kernel_launchers.h | 0 .../infershaped/infershaped_utils.h | 0 .../infershaped/phi_kernel_launcher.h} | 4 +- paddle/infrt/kernel/{pten => phi}/registry.cc | 30 ++-- paddle/infrt/kernel/{pten => phi}/registry.h | 4 +- paddle/infrt/kernel/test_kernels.cc | 4 +- paddle/infrt/pass/CMakeLists.txt | 1 + paddle/infrt/tests/dialect/basic.mlir | 36 ++--- paddle/infrt/tests/dialect/benchmark.mlir | 14 +- paddle/infrt/tests/dialect/dense_tensor.mlir | 18 +-- .../tests/dialect/disabled_tensor_map.mlir | 30 ++-- .../infrt/tests/dialect/disabled_trt_ops.mlir | 6 +- paddle/infrt/tests/dialect/paddle_ops.mlir | 5 +- .../tests/dialect/pten/dense_tensor.mlir | 10 +- .../infrt/tests/dialect/pten/pten_pass.mlir | 10 ++ .../tests/dialect/tensor/dense_tensor.mlir | 18 +-- .../tests/dialect/tensor/naive_kernels.mlir | 28 ++-- .../tests/dialect/tensor/tensor_map.mlir.in | 10 +- .../tests/dialect/tensor/tensor_shape.mlir | 2 +- .../tests/dialect/tensor/tensor_type.mlir | 8 +- paddle/infrt/tests/dialect/tensor_shape.mlir | 2 +- paddle/infrt/tests/dialect/tensor_type.mlir | 8 +- paddle/scripts/infrt_build.sh | 2 +- ...function.sh => get_phi_kernel_function.sh} | 6 +- ..._kernel_info.py => get_phi_kernel_info.py} | 18 +-- 93 files changed, 1272 insertions(+), 699 deletions(-) rename paddle/infrt/backends/host/{pten_allocator.h => phi_allocator.h} (95%) rename paddle/infrt/backends/host/{pten_context.h => phi_context.h} (94%) create mode 100644 paddle/infrt/dialect/infrt/common_type.cc create mode 100644 paddle/infrt/dialect/infrt/common_type.h create mode 100644 paddle/infrt/dialect/infrt/infrt_ops_base.td create mode 100644 paddle/infrt/dialect/phi/CMakeLists.txt rename paddle/infrt/dialect/{pten/infrt_pten_base.td => phi/infrt_phi_base.td} (56%) create mode 100644 paddle/infrt/dialect/phi/infrt_phi_kernel.td rename paddle/infrt/dialect/{pten/infrt_pten_tensor.cc => phi/infrt_phi_tensor.cc} (65%) rename paddle/infrt/dialect/{pten/infrt_pten_tensor.h => phi/infrt_phi_tensor.h} (83%) rename paddle/infrt/dialect/{pten/infrt_pten_tensor.td => phi/infrt_phi_tensor.td} (71%) create mode 100644 paddle/infrt/dialect/phi/pass/CMakeLists.txt create mode 100644 paddle/infrt/dialect/phi/pass/kernel_op_desc.cc create mode 100644 paddle/infrt/dialect/phi/pass/kernel_op_desc.h create mode 100644 paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc create mode 100644 paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h create mode 100644 paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc create mode 100644 paddle/infrt/dialect/phi/pass/proto_arg_map_context.h rename paddle/infrt/dialect/{pten/pten_base.cc => phi/phi_base.cc} (75%) rename paddle/infrt/dialect/{pten/pten_base.h => phi/phi_base.h} (78%) create mode 100644 paddle/infrt/dialect/phi/phi_exec.cc delete mode 100644 paddle/infrt/dialect/pten/CMakeLists.txt delete mode 100644 paddle/infrt/dialect/pten/infrt_pten_kernel.td rename paddle/infrt/kernel/{pten => phi}/CMakeLists.txt (61%) rename paddle/infrt/kernel/{pten => phi}/allocator_kernels.cc (81%) rename paddle/infrt/kernel/{pten => phi}/allocator_kernels.h (84%) rename paddle/infrt/kernel/{pten => phi}/context_kernels.cc (82%) rename paddle/infrt/kernel/{pten => phi}/context_kernels.h (84%) rename paddle/infrt/kernel/{pten => phi}/dense_tensor_kernels.cc (90%) rename paddle/infrt/kernel/{pten => phi}/dense_tensor_kernels.h (89%) rename 
paddle/infrt/kernel/{pten => phi}/infershaped/infershape_launchers_test.cc (93%) rename paddle/infrt/kernel/{pten => phi}/infershaped/infershaped_kernel_launcher.cc (96%) rename paddle/infrt/kernel/{pten => phi}/infershaped/infershaped_kernel_launcher.h (100%) rename paddle/infrt/kernel/{pten => phi}/infershaped/infershaped_kernel_launchers.h (100%) rename paddle/infrt/kernel/{pten => phi}/infershaped/infershaped_utils.h (100%) rename paddle/infrt/kernel/{pten/infershaped/pten_kernel_launcher.h => phi/infershaped/phi_kernel_launcher.h} (93%) rename paddle/infrt/kernel/{pten => phi}/registry.cc (65%) rename paddle/infrt/kernel/{pten => phi}/registry.h (88%) create mode 100755 paddle/infrt/pass/CMakeLists.txt create mode 100644 paddle/infrt/tests/dialect/pten/pten_pass.mlir rename tools/infrt/{get_pten_kernel_function.sh => get_phi_kernel_function.sh} (89%) rename tools/infrt/{get_pten_kernel_info.py => get_phi_kernel_info.py} (92%) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 2486c54d5addc..0f6dfb9d8f44e 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -2,13 +2,13 @@ if (NOT WITH_INFRT) return() endif() -option(INFRT_WITH_PTEN "Compile INFRT with PTEN" ON) +option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) -if (INFRT_WITH_PTEN) - add_definitions("-DINFRT_WITH_PTEN") +if (INFRT_WITH_PHI) + add_definitions("-DINFRT_WITH_PHI") endif() # compile flags @@ -97,16 +97,16 @@ set(infrt_mlir_incs rewrite_inc trt_ops_inc ) -if (INFRT_WITH_PTEN) - set(pten_libs pten) +if (INFRT_WITH_PHI) + set(phi_libs pten) set(infrt_mlir_incs ${infrt_mlir_incs} - MLIRinfrt_pten_tensorIncGen - MLIRinfrt_pten_baseIncGen + MLIRinfrt_phi_tensorIncGen + MLIRinfrt_phi_baseIncGen ) endif() -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto infrt_naive) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto) +cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index c2a4e0aff7a08..28f63db49f4ba 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -42,7 +42,6 @@ using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT using namespace infrt::tensor; // NOLINT using infrt::dt::TensorMapType; // NOLINT -using infrt::dt::TensorType; // NOLINT namespace infrt { @@ -145,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "infrt.return") { + if (last_op.getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/backends/host/pten_allocator.h b/paddle/infrt/backends/host/phi_allocator.h similarity index 95% rename from paddle/infrt/backends/host/pten_allocator.h rename to paddle/infrt/backends/host/phi_allocator.h index fa61e04fb6707..c8f97e04a1b83 100644 --- 
a/paddle/infrt/backends/host/pten_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace infrt { namespace backends { -class CpuPtenAllocator : public phi::Allocator { +class CpuPhiAllocator : public phi::Allocator { public: static void deleter(phi::Allocation* ptr) { ::operator delete(ptr); } diff --git a/paddle/infrt/backends/host/pten_context.h b/paddle/infrt/backends/host/phi_context.h similarity index 94% rename from paddle/infrt/backends/host/pten_context.h rename to paddle/infrt/backends/host/phi_context.h index 961c93529aeb4..9d0e3bc4fbb31 100644 --- a/paddle/infrt/backends/host/pten_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace infrt { namespace backends { -class CpuPtenContext : public phi::CPUContext { +class CpuPhiContext : public phi::CPUContext { public: using Base = phi::CPUContext; using phi::CPUContext::SetEigenDevice; diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index 757d47a8de43e..e35989da2085b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -16,7 +16,7 @@ gather_srcs(infrt_src SRCS mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) mlir_tablegen_on(pd_op_base DIALECT pd) @@ -36,6 +36,6 @@ cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_I add_subdirectory(infrt) add_subdirectory(tensorrt) -if (INFRT_WITH_PTEN) - add_subdirectory(pten) +if (INFRT_WITH_PHI) + add_subdirectory(phi) endif() diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index bad7e73ec5ae5..c1aa75fb24650 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -90,7 +90,7 @@ static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op->getAttr("callee") << "("; + p << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; p.printOptionalAttrDict(op->getAttrs(), {"callee"}); @@ -98,7 +98,7 @@ static void print(OpAsmPrinter &p, CallOp op) { // NOLINT } static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT - p << op->getName() << " "; + p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); if (op->getAttrs().size() > 1) p << ' '; @@ -128,7 +128,6 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT } static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - p << "infrt.return"; if (op.getNumOperands() > 0) { p << ' '; p.printOperands(op.getOperands()); diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index 32845a09351f7..aadc146e36280 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -48,10 +48,10 @@ def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; def ReturnOp : INFRT_Op<"return", [Terminator]> { let summary = "host executor return operation"; let description = [{ - The "infrt.return" operation represents a return operation within a function. + The "Infrt.return" operation represents a return operation within a function. 
func @foo() : (i32, f8) { - infrt.return %0, %1 : i32, f8 + Infrt.return %0, %1 : i32, f8 } }]; @@ -112,7 +112,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def GetStringOp : INFRT_Op<"get_string"> { - let summary = "infrt.get_string"; + let summary = "Infrt.get_string"; let description = [{ Get a !infrt.string value from the given string attribute. }]; @@ -124,7 +124,7 @@ def GetStringOp : INFRT_Op<"get_string"> { } def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "infrt.print_string"; + let summary = "Infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index fde265765c6d2..49d6887ada032 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -39,52 +39,6 @@ void DTDialect::initialize() { >(); } -llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_insensitive("x86")) - return TargetType::X86; - else if (key.equals_insensitive("cuda")) - return TargetType::CUDA; - else - return llvm::None; -} - -llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_insensitive("nchw")) - return LayoutType::NCHW; - else if (key.equals_insensitive("nhwc")) - return LayoutType::NHWC; - else - return llvm::None; -} - -llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_insensitive("i32")) - return PrecisionType::I32; - else if (key.equals_insensitive("f32")) - return PrecisionType::F32; - else - return llvm::None; -} - -TensorType TensorType::get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision) { - return Base::get(ctx, target, layout, precision); -} - -TargetType TensorType::target() { return getImpl()->target_; } - -LayoutType TensorType::layout() { return getImpl()->layout_; } - -PrecisionType TensorType::precision() { return getImpl()->precision_; } - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { - os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return os; -} - TensorMapType TensorMapType::get() { return Base::get(::infrt::Global::getMLIRContext()); } @@ -101,48 +55,6 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { - switch (type) { - case (TargetType::X86): - os << "X86"; - break; - case (TargetType::CUDA): - os << "CUDA"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { - switch (type) { - case (LayoutType::NCHW): - os << "NCHW"; - break; - case (LayoutType::NHWC): - os << "NHWC"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { - switch (type) { - case (PrecisionType::I32): - os << "I32"; - break; - case (PrecisionType::F32): - os << "F32"; - break; - default: - os << "Unsupported"; - } - return os; -} - static mlir::Type getTensorType(mlir::MLIRContext *context) { auto t_dialect = mlir::Identifier::get("t", context); return mlir::OpaqueType::get(t_dialect, "tensor"); @@ -165,7 +77,7 @@ static mlir::ParseResult parseCreateUninitTensorOp( if (parser.parseArrow()) return mlir::failure(); if (parser.parseType(outputRawTypes[0])) return mlir::failure(); - if (!outputRawTypes[0].isa()) + if (!outputRawTypes[0].isa()) 
return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); return mlir::success(); diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 08ba8d720662b..b0a1ea412c53e 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,68 +19,10 @@ #include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" + namespace infrt { namespace dt { -enum class TargetType : uint8_t { X86, CUDA }; -enum class LayoutType : uint8_t { NCHW, NHWC }; -enum class PrecisionType : uint8_t { I32, F32 }; - -llvm::Optional GetTargetType(mlir::StringRef key); -llvm::Optional GetLayoutType(mlir::StringRef key); -llvm::Optional GetPrecisionType(mlir::StringRef key); - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); - -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - -class TensorType : public mlir::Type::TypeBase { - public: - using Base::Base; - - static TensorType get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision); - - TargetType target(); - LayoutType layout(); - PrecisionType precision(); -}; - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType); - class TensorMapType : public mlir::Type::TypeBase { diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 75c8a0d88e4c1..7e6e838a72372 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -28,7 +28,7 @@ class CreateUninitTensorOp }]; let arguments = (ins I64ArrayAttr:$shape); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; @@ -43,8 +43,8 @@ def ShallowCopyTensorOp An operation that copy a tensor shallowly. }]; - let arguments = (ins TensorType:$input); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$input); + let results = (outs DenseTensor:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -59,7 +59,7 @@ class FillTensorWithConstantOp : }]; let arguments = (ins - TensorType:$input, + DenseTensor:$input, AnyAttr:$value ); let results = (outs); @@ -77,7 +77,7 @@ def PrintTensorOp : DT_Op<"print_tensor"> { An operation that prints a tensor. 
}]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs); let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } @@ -90,7 +90,7 @@ class SetTensorOp : An operation that sets an input tensor with given values. }]; - let arguments = (ins TensorType); + let arguments = (ins DenseTensor); let results = (outs); let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; @@ -125,7 +125,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { TensorMapType:$map, StrAttr:$name ); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)"; let verifier = ?; } @@ -149,7 +149,7 @@ def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { An operation that returns the shape of the input tensor. }]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs TS_Shape:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -162,8 +162,8 @@ class NaiveElementwiseAddOp : Naive elementwise_add operation. Just for testing. }]; - let arguments = (ins TensorType:$a, TensorType:$b); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$a, DenseTensor:$b); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $a `,` $b `)` attr-dict `:` `(` type($a) `,` type($b) `)` `->` type($output)"; } @@ -175,8 +175,8 @@ class NaiveMatmulOp : Naive matmul operation. Just for testing. }]; - let arguments = (ins TensorType:$x, TensorType:$w); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$x, DenseTensor:$w); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $x `,` $w `)` attr-dict `:` `(` type($x) `,` type($w) `)` `->` type($output)"; } diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 98910d8d0ecf0..daf710e0baf54 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,7 +1,15 @@ core_gather_headers() gather_srcs(infrt_src SRCS + common_type.cc infrt_dialect.cc ) -add_mlir_dialect(infrt_ops Infrt) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common_type.cc new file mode 100644 index 0000000000000..5cbd7b2cd6153 --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { + +llvm::Optional GetTargetType(llvm::StringRef key) { + if (key.equals_insensitive("CPU")) + return TargetType::CPU; + else if (key.equals_insensitive("GPU")) + return TargetType::GPU; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(llvm::StringRef key) { + if (key.equals_insensitive("NCHW")) + return LayoutType::NCHW; + else if (key.equals_insensitive("NHWC")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(llvm::StringRef key) { + if (key.equals_insensitive("FP32")) + return PrecisionType::FLOAT32; + else if (key.equals_insensitive("FP16")) + return PrecisionType::FLOAT16; + else + return llvm::None; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::CPU): + os << "CPU"; + break; + case (TargetType::GPU): + os << "GPU"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::FLOAT32): + os << "FP32"; + break; + case (PrecisionType::FLOAT16): + os << "FP16"; + break; + default: + os << "Unsupported"; + } + return os; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h new file mode 100644 index 0000000000000..d6d6503c03be5 --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +namespace infrt { + +enum class TargetType : uint8_t { CPU, GPU, UNK }; +enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; + +struct Place { + TargetType target; + PrecisionType precision; + LayoutType layout; + Place(TargetType tar, PrecisionType pre, LayoutType lay) + : target(tar), precision(pre), layout(lay) {} + Place() + : target(TargetType::UNK), + precision(PrecisionType::UNK), + layout(LayoutType::UNK) {} +}; + +llvm::Optional GetTargetType(llvm::StringRef key); +llvm::Optional GetLayoutType(llvm::StringRef key); +llvm::Optional GetPrecisionType(llvm::StringRef key); + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type); + +} // end namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc index 388de858b6572..abb60016f9023 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -23,6 +23,9 @@ #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" @@ -33,6 +36,12 @@ void InfrtDialect::initialize() { #define GET_TYPEDEF_LIST #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT >(); + + addAttributes< +#define GET_ATTRDEF_LIST +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT + >(); + addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT @@ -57,36 +66,104 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { // Parse the element type. 
if (parser.parseType(elementType)) return nullptr; - // parse "," - if (parser.parseComma()) return nullptr; - - // llvm::APInt lod_level; - if (parser.parseInteger(lod_level)) return nullptr; - + // parse optional lod_level + if (parser.parseOptionalComma().succeeded()) { + // llvm::APInt lod_level; + if (parser.parseInteger(lod_level)) return nullptr; + } // parse ">" if (parser.parseGreater()) return nullptr; return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor") { + // parse DenseTensor, for example: !i=Infrt.tensor + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + return DenseTensorType::get( + parser.getContext(), *targetType, *precisionType, *layoutType); + } // Todo: parse other type return mlir::Type(); } void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print TensorType, for example: !infrt.tensor + // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { - auto lodTensorType = type.cast(); + auto lod_tensor_type = type.cast(); os << "lod_tensor<"; - auto shape = lodTensorType.getShape(); - for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) - os << *dim << 'x'; - os << shape.back() << 'x' << lodTensorType.getElementType() << ", " - << lodTensorType.getLod_level() << ">"; + auto shape = lod_tensor_type.getShape(); + for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) { + *dim < 0 ? os << '?' : os << *dim; + os << 'x'; + } + shape.back() < 0 ? os << '?' : os << shape.back(); + os << 'x' << lod_tensor_type.getElementType() << ", " + << lod_tensor_type.getLod_level() << ">"; return; } + + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + auto dense_tensor_type = type.cast(); + os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " + << dense_tensor_type.getPrecision() << ", " + << dense_tensor_type.getLayout() << ">"; + return; + } + llvm_unreachable("unknown infrt type."); } +// /// Parse an attribute registered to this dialect. +// ::mlir::Attribute InfrtDialect::parseAttribute(::mlir::DialectAsmParser +// &parser, +// ::mlir::Type type) const { +// return mlir::Attribute(); +// } +// /// Print an attribute registered to this dialect. 
+// void InfrtDialect::printAttribute(::mlir::Attribute attr, +// ::mlir::DialectAsmPrinter &os) const { + +// } + } // namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/infrt_dialect.h index 21a1f6b34f6a5..ed5b36e556149 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/infrt_dialect.h @@ -17,13 +17,19 @@ //===----------------------------------------------------------------------===// // Dialect //===----------------------------------------------------------------------===// +#include #include #include #include #include +#include "paddle/infrt/dialect/infrt/common_type.h" #include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" + +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td index 319760973cd90..00f94805c7db2 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -1,34 +1,4 @@ -#ifndef Infrt_OpS -#define Infrt_OpS - -include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffectInterfaces.td" - -def Infrt_Dialect : Dialect { - let summary = - "A dialect containing the Infrt Attributes, Operations, and Types"; - - let name = "Infrt"; - let cppNamespace = "::infrt"; -} - -// Type definitions - -// Base class for Infrt dialect types. -class Infrt_Type traits = [], - string baseCppClass = "::mlir::Type"> - : TypeDef { -} - -def LoDTensor : Infrt_Type<"LoDTensor"> { - let summary = "infrt lod tensor"; - let description = [{lod_tensor<3x64x3x3xf32, 3>}]; - let parameters = (ins - ArrayRefParameter<"int64_t">:$shape, - "mlir::Type":$elementType, - "int32_t":$lod_level - ); -} +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -39,14 +9,11 @@ class Infrt_Op traits = []> : Op { -// let summary = "kernel op"; -// let description = [{ -// kernel op! -// }]; -// let arguments = (ins StrAttr:$name, PD_Tensor:$X, PD_Tensor:$Y, DefaultValuedAttr:$Alpha, DefaultValuedAttr:$Beta); -// -// let results = (outs PD_Tensor:$Out); -// } - -#endif // Infrt_OpS +def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { + let summary = "kernel op"; + let description = [{kernel op!}]; + let arguments = (ins Variadic:$operands, + StrAttr:$name, + OptionalAttr:$attrs); + let results = (outs Variadic); +} diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td new file mode 100644 index 0000000000000..81d3d028a66be --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -0,0 +1,49 @@ +#ifndef INFRT_OPS_BASE +#define INFRT_OPS_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def Infrt_Dialect : Dialect { + let summary = + "A dialect containing the Infrt Attributes, Operations, and Types"; + + let name = "infrt"; + let cppNamespace = "::infrt"; +} + +// Type definitions + +// Base class for Infrt dialect types. 
+class Infrt_Type traits = [], + string baseCppClass = "::mlir::Type"> + : TypeDef { +} + +def LoDTensor : Infrt_Type<"LoDTensor"> { + let summary = "infrt lod tensor"; + let description = [{lod_tensor<3x64x3x3xf32, 3>}]; + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "mlir::Type":$elementType, + "int32_t":$lod_level + ); +} + +def DenseTensor : Infrt_Type<"DenseTensor"> { + let summary = "infrt dense tensor"; + let description = [{dense_tensor<, 3>}]; + let parameters = (ins + "TargetType":$target, + "PrecisionType":$precision, + "LayoutType":$layout + ); +} + +// Base class for infrt dialect attributes. +class Infrt_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { + let mnemonic = ?; +} +#endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index c0101a8c16608..8c595c06745f1 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -27,7 +27,6 @@ void INFRTDialect::initialize() { allowUnknownOperations(); addTypes(); - addTypes(); addTypes(); addOperations< @@ -43,51 +42,6 @@ void INFRTDialect::initialize() { mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorType, for example: !infrt.tensor - if (keyword == "tensor") { - llvm::StringRef target; - llvm::StringRef layout; - llvm::StringRef precision; - - // parse "<" - if (parser.parseLess()) return mlir::Type(); - // parse target - if (parser.parseKeyword(&target)) return mlir::Type(); - auto targetType = infrt::dt::GetTargetType(target); - if (!targetType) { - parser.emitError(parser.getCurrentLocation(), "unknown target type: ") - << target; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse layout - if (parser.parseKeyword(&layout)) return mlir::Type(); - auto layoutType = infrt::dt::GetLayoutType(layout); - if (!layoutType) { - parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") - << layout; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse precision - if (parser.parseKeyword(&precision)) return mlir::Type(); - auto precisionType = infrt::dt::GetPrecisionType(precision); - if (!precisionType) { - parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") - << precision; - return mlir::Type(); - } - - // parse ">" - if (parser.parseGreater()) return mlir::Type(); - - return infrt::dt::TensorType::get( - parser.getContext(), *targetType, *layoutType, *precisionType); - } // parse TensorMapType, for example: !infrt.tensor_map if (keyword == "tensor_map") { return infrt::dt::TensorMapType::get(); @@ -104,13 +58,6 @@ mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { void INFRTDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &printer) const { - // print TensorType, for example: !infrt.tensor - if (type.isa()) { - auto tensorType = type.cast(); - printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return; - } // print TensorMapType, for example: !infrt.tensor_map if (type.isa()) { printer << "tensor_map"; diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 4021a5a6d3cd2..a8e7e13a681ca 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -43,7 +43,7 @@ class INFRTDialect : 
public mlir::Dialect { friend class mlir::MLIRContext; public: - static ::llvm::StringRef getDialectNamespace() { return "infrt"; } + static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } }; } // namespace dialect diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 1abd294236d93..4d4727ee8e185 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -2,9 +2,10 @@ #define INFRT_BASE include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def INFRT_Dialect : Dialect { - let name = "infrt"; + let name = "Infrt"; let description = [{ The INFRT host dialect. @@ -18,9 +19,6 @@ def StringType : Type()">, "!infrt.string type">, BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; -def TensorType : - Type()">, "!infrt.tensor type">; - def TensorMapType : Type()">, "!infrt.tensor_map type">, BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 090f1aea28910..b5b8de7a20d08 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -32,9 +32,9 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT infrt::InfrtDialect, dt::DTDialect, mlir::pd::PaddleDialect, -#ifdef INFRT_WITH_PTEN - pten::PTENDenseTensorDialect, - pten::PTENDialect +#ifdef INFRT_WITH_PHI + phi::PHIDenseTensorDialect, + phi::PHIDialect #endif >(); } diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 1115053073044..2f721e49a6309 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -32,13 +32,13 @@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index a61a4645eff76..266bdf60de788 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -6,7 +6,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def PD_Dialect : Dialect { let name = "pd"; diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt new file mode 100644 index 0000000000000..626b02c1f790d --- /dev/null +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -0,0 +1,18 @@ +if (NOT INFRT_WITH_PHI) + return() +endif() + +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) 
+add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc infrt_phi_tensor.cc + infrt_phi_tensor.cc) + +add_subdirectory(pass) + +add_executable(phi-exec phi_exec.cc) +target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/pten/infrt_pten_base.td b/paddle/infrt/dialect/phi/infrt_phi_base.td similarity index 56% rename from paddle/infrt/dialect/pten/infrt_pten_base.td rename to paddle/infrt/dialect/phi/infrt_phi_base.td index 20a43f9a92620..e297fad86be75 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_base.td +++ b/paddle/infrt/dialect/phi/infrt_phi_base.td @@ -1,26 +1,26 @@ -#ifndef PTEN_BASE -#define PTEN_BASE +#ifndef PHI_BASE +#define PHI_BASE include "mlir/IR/OpBase.td" -def PTEN_Dialect : Dialect { - let name = "pten"; +def PHI_Dialect : Dialect { + let name = "phi"; let description = [{ - The PTEN host dialect. + The PHI host dialect. }]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } class AllocatorTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.allocator_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.allocator_", place, " type"); } class ContextTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.context_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.context_", place, " type"); } def CPU_Allocator : AllocatorTypeOf<"CPU">; diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/infrt_phi_kernel.td new file mode 100644 index 0000000000000..9ae469605860b --- /dev/null +++ b/paddle/infrt/dialect/phi/infrt_phi_kernel.td @@ -0,0 +1,31 @@ +#ifndef PHI_KERNEL +#define PHI_KERNEL + +include "paddle/infrt/dialect/phi/infrt_phi_tensor.td" + +def PHI_KernelDialect : Dialect { + let name = "phi_kernel"; + + let description = [{ + The PHI Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +// PHI Kernel related ops. +class PDT_Kernel traits = []> : Op { +} + +def FakeKernelOp : PDT_Kernel<"phi.matmul.host.fp32"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let results = (outs DenseTensor:$output); +} + +def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); + let results = (outs DenseTensor:$output); +} + +#endif + diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc similarity index 65% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.cc rename to paddle/infrt/dialect/phi/infrt_phi_tensor.cc index b3e99da8750fb..9df1a47031b1f 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc @@ -12,25 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
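With DenseTensor now defined in infrt_ops_base.td and parsed/printed by InfrtDialect, the same type can also be built programmatically. A short sketch, assuming the generated DenseTensorType API follows the parameter order used in InfrtDialect::parseType above (the wrapper function is illustrative):

#include <mlir/IR/MLIRContext.h>
#include "paddle/infrt/dialect/infrt/infrt_dialect.h"

// Builds the type that prints as !infrt.dense_tensor<CPU, FP32, NCHW>.
// The context must already have the infrt dialect loaded.
infrt::DenseTensorType MakeCpuFp32NchwType(mlir::MLIRContext *ctx) {
  return infrt::DenseTensorType::get(ctx,
                                     infrt::TargetType::CPU,
                                     infrt::PrecisionType::FLOAT32,
                                     infrt::LayoutType::NCHW);
}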
-#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDenseTensorDialect::initialize() { +void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" >(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.h b/paddle/infrt/dialect/phi/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.h rename to paddle/infrt/dialect/phi/infrt_phi_tensor.h index 5fe259300d2ae..2780f9759185e 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.h +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.td b/paddle/infrt/dialect/phi/infrt_phi_tensor.td similarity index 71% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.td rename to paddle/infrt/dialect/phi/infrt_phi_tensor.td index 528f0f919680d..b4607f632c9b9 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.td +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.td @@ -1,36 +1,36 @@ -#ifdef PTEN_TENSOR +#ifdef PHI_TENSOR #else -#define PTEN_TENSOR +#define PHI_TENSOR -include "paddle/infrt/dialect/pten/infrt_pten_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -def PTEN_DenseTensorDialect : Dialect { - let name = "pten_dt"; +def PHI_DenseTensorDialect : Dialect { + let name = "phi_dt"; let description = [{ - The PTEN DenseTensor dialect. + The PHI DenseTensor dialect. }]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } -// PTEN DenseTensor related Op. -class PDT_Op traits = []> : Op { +// PHI DenseTensor related Op. +class PDT_Op traits = []> : Op { } class CreateDenseTensorOp : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." # layout, [NoSideEffect]> { let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); } class FillDenseTensorOp : PDT_Op<"fill_dense_tensor." 
# dtype> { let arguments = (ins - TensorType:$input, + DenseTensor:$input, attr_type:$value ); let results = (outs); diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt new file mode 100644 index 0000000000000..5c55a6b0acaed --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + proto_arg_map_context.cc + phi_op_cvt_pass.cc + kernel_op_desc.cc + ) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc new file mode 100644 index 0000000000000..63869b7d7b9ea --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/kernel_registry.h" +namespace infrt { + +phi::Backend cvtTarget2Phi(TargetType target) { + switch (target) { + case TargetType::CPU: + return phi::Backend::CPU; + case TargetType::GPU: + return phi::Backend::GPU; + default: + return phi::Backend::UNDEFINED; + } +} + +TargetType cvtTargetFromPhi(phi::Backend backend) { + switch (backend) { + case phi::Backend::CPU: + return TargetType::CPU; + case phi::Backend::GPU: + return TargetType::GPU; + default: + return TargetType::UNK; + } +} + +phi::DataType cvtPrecision2Phi(PrecisionType precision) { + switch (precision) { + case PrecisionType::FLOAT32: + return phi::DataType::FLOAT32; + break; + case PrecisionType::FLOAT16: + return phi::DataType::FLOAT16; + default: + return phi::DataType::UNDEFINED; + } +} + +PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { + switch (datatype) { + case phi::DataType::FLOAT32: + return PrecisionType::FLOAT32; + case phi::DataType::FLOAT16: + return PrecisionType::FLOAT16; + default: + return PrecisionType::UNK; + } +} + +phi::DataLayout cvtLayout2Phi(LayoutType layout) { + switch (layout) { + case LayoutType::NCHW: + return phi::DataLayout::NCHW; + case LayoutType::NHWC: + return phi::DataLayout::NHWC; + default: + return phi::DataLayout::UNDEFINED; + } +} + +LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { + switch (layout) { + case phi::DataLayout::NCHW: + return LayoutType::NCHW; + case phi::DataLayout::NHWC: + return LayoutType::NHWC; + default: + return LayoutType::UNK; + } +} + +phi::KernelKey cvtPlace2Phi(const Place& place) { + return phi::KernelKey(cvtTarget2Phi(place.target), + cvtLayout2Phi(place.layout), + cvtPrecision2Phi(place.precision)); +} + +Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(cvtTargetFromPhi(tensor_arg.backend), + cvtPrecisionFromPhi(tensor_arg.dtype), + cvtLayoutFromPhi(tensor_arg.layout)); +} + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces) { + std::vector candidate_kernels; + PhiKernelDesc phi_kernel_desc; + 
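// For each requested place, first look for an exact (backend, layout, dtype)
// kernel key in the factory map; if none is registered, retry with
// DataLayout::ALL_LAYOUT for the same backend and dtype before skipping the
// place. Every hit is recorded together with its per-argument input and
// output places.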
phi::KernelKeyMap kernel_key_map = + phi::KernelFactory::Instance().SelectKernelMap(name); + for (const Place& place : valid_palces) { + phi::KernelKey kernel_key = cvtPlace2Phi(place); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { + kernel_key = phi::KernelKey(kernel_key.backend(), + phi::DataLayout::ALL_LAYOUT, + kernel_key.dtype()); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; + } + phi_kernel_desc.kernelType = place; + phi_kernel_desc.inputsType.clear(); + phi_kernel_desc.outputsType.clear(); + phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); + const paddle::SmallVector& input_arg = + args_def.input_defs(); + const paddle::SmallVector& output_arg = + args_def.output_defs(); + for (auto tensor_arg : input_arg) { + phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + for (auto tensor_arg : output_arg) { + phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + candidate_kernels.emplace_back(phi_kernel_desc); + } + return candidate_kernels; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h new file mode 100644 index 0000000000000..b74107f674e51 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { + +struct PhiKernelDesc { + std::vector inputsType; // kernel input place + std::vector outputsType; // kernel output place + Place kernelType; // kernel place +}; + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces); + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc new file mode 100644 index 0000000000000..df3472aa01dfb --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
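The pass below will eventually resolve each kernel op against the phi registry through getCandidateKernels, declared in kernel_op_desc.h above. A minimal query sketch, assuming the phi kernel library is linked in; the wrapper and the "matmul" kernel name are only examples:

#include <vector>
#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"

void ListMatmulKernels() {  // illustrative helper, not from the patch
  std::vector<infrt::Place> places{infrt::Place(infrt::TargetType::CPU,
                                                infrt::PrecisionType::FLOAT32,
                                                infrt::LayoutType::NCHW)};
  for (const infrt::PhiKernelDesc &desc :
       infrt::getCandidateKernels("matmul", places)) {
    // desc.kernelType is the place the kernel runs on; desc.inputsType and
    // desc.outputsType list the expected place of every tensor argument.
    (void)desc;
  }
}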
+ +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/ops/compat/signatures.h" +namespace infrt { +// Implementation of the phiOpCvtPass. +void phiOpCvtPass::runOnFunction() { + convertStage(); + diapatchStage(); +} +void phiOpCvtPass::convertStage() { + mlir::Block &body = getFunction().front(); + std::vector worklist; + for (auto &op : body.without_terminator()) { + worklist.push_back(&op); + } + mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (op == nullptr) continue; + + std::string op_name = op->getName().getIdentifier().str(); + + // only convert op in pd dialect. + if (op_name.substr(0, 3) != "pd.") continue; + op_name = op_name.substr(3); + if (pd_dialect_inputs_info_map_.find(op_name) == + pd_dialect_inputs_info_map_.end() || + pd_dialect_outputs_info_map_.find(op_name) == + pd_dialect_outputs_info_map_.end()) { + // Todo: print log + continue; + } + + phi::KernelSignature kernel_sign = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); + } + + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); + } + if (!op->use_empty()) { + // Todo: print error log + return; + } + op->erase(); + } +} +void phiOpCvtPass::diapatchStage() { + std::vector worklist; + mlir::Block &block = getFunction().front(); + for (auto &op : block) { + infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); + if (nullptr != kernel_op) worklist.push_back(kernel_op); + } + // ToDo: implementation in the next PR + while (!worklist.empty()) { + // infrt::KernelOp kernel_op = worklist.back(); + worklist.pop_back(); + // std::string kernel_name = kernel_op.name().str(); + // std::vector candidates = + // getCandidateKernels(kernel_name, valid_places_); + } +} +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h new file mode 100644 index 0000000000000..051fee9b61a24 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { +/* + * phiOpCvtPass. + * + * Convert the general operators in pd Dialect to a infrt.kernelOp. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.conv2d"(%a) ... + * %d = "pd.conv3d"(%c) ... + * %f = "pd.conv2d"(%a) ... + * "pd.fetch" (%d, %f) + * } + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "infrt.kernel"(%a){name = "conv2d"} ... + * %d = "infrt.kernel"(%c){name = "conv3d"}... + * %f = "infrt.kernel"(%a){name = "conv2d"}... + * "pd.fetch" (%d, %f) + * } + */ +class phiOpCvtPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + void runOnFunction() override; + explicit phiOpCvtPass(std::vector valid_places = std::vector()) + : valid_places_(valid_places) {} + + private: + void convertStage(); + void diapatchStage(); + std::vector valid_places_; +}; +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc new file mode 100644 index 0000000000000..64b184359700e --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" + +namespace infrt { + +bool ProtoArgumentMappingContext::HasInput(const std::string& name) const { + if (input_map_.find(name) == input_map_.end()) { + return false; + } + uint8_t index = input_map_.at(name); + return static_cast(op_->getOperand(index)); +} + +bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { + if (output_map_.find(name) == output_map_.end()) { + return false; + } + return true; +} + +bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { + return op_->hasAttr(name); +} + +paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { + mlir::Attribute attrs = op_->getAttr(name); + if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null()) { + return paddle::any(str_attr.str()); + } else { + // ToDO: implementation in the ext PR. 
+    return paddle::any(0);
+  }
+}
+
+size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const {
+  return op_->getNumOperands();
+}
+size_t ProtoArgumentMappingContext::OutputSize(const std::string& name) const {
+  return op_->getNumResults();
+}
+
+bool ProtoArgumentMappingContext::IsDenseTensorInput(
+    const std::string& name) const {
+  return true;
+}
+bool ProtoArgumentMappingContext::IsSelectedRowsInput(
+    const std::string& name) const {
+  return false;
+}
+
+bool ProtoArgumentMappingContext::IsDenseTensorOutput(
+    const std::string& name) const {
+  return true;
+}
+bool ProtoArgumentMappingContext::IsSelectedRowsOutput(
+    const std::string& name) const {
+  return false;
+}
+
+} // namespace infrt
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
new file mode 100644
index 0000000000000..843b19d217feb
--- /dev/null
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include "paddle/infrt/dialect/pd_ops_info.h"
+#include "paddle/phi/core/compat/arg_map_context.h"
+
+namespace infrt {
+class ProtoArgumentMappingContext : public phi::ArgumentMappingContext {
+ public:
+  // only supports ops in the pd dialect
+  explicit ProtoArgumentMappingContext(mlir::Operation* op)
+      : op_(op),
+        input_map_(pd_dialect_inputs_info_map_.at(
+            op->getName().getIdentifier().str().substr(3))),
+        output_map_(pd_dialect_outputs_info_map_.at(
+            op->getName().getIdentifier().str().substr(3))) {}
+  bool HasInput(const std::string& name) const override;
+  bool HasOutput(const std::string& name) const override;
+  bool HasAttr(const std::string& name) const override;
+
+  // now we can't use Attribute here, it would cause phi to rely on
+  // boost::variant and BlockDesc
+  paddle::any Attr(const std::string& name) const override;
+
+  size_t InputSize(const std::string& name) const override;
+  size_t OutputSize(const std::string& name) const override;
+
+  bool IsDenseTensorInput(const std::string& name) const override;
+  bool IsSelectedRowsInput(const std::string& name) const override;
+
+  bool IsDenseTensorOutput(const std::string& name) const override;
+  bool IsSelectedRowsOutput(const std::string& name) const override;
+
+ private:
+  mlir::Operation* op_;
+  const std::unordered_map& input_map_;
+  const std::unordered_map& output_map_;
+};
+
+} // namespace infrt
diff --git a/paddle/infrt/dialect/pten/pten_base.cc b/paddle/infrt/dialect/phi/phi_base.cc
similarity index 75%
rename from paddle/infrt/dialect/pten/pten_base.cc
rename to paddle/infrt/dialect/phi/phi_base.cc
index ba87787dd7f7c..a1caa40f6383b 100644
--- a/paddle/infrt/dialect/pten/pten_base.cc
+++ b/paddle/infrt/dialect/phi/phi_base.cc
@@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include #include @@ -21,14 +21,14 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDialect::printType(::mlir::Type type, - mlir::DialectAsmPrinter& os) const { +void PHIDialect::printType(::mlir::Type type, + mlir::DialectAsmPrinter& os) const { if (type.isa()) { os << "CPU_Allocator"; return; @@ -48,18 +48,18 @@ void PTENDialect::printType(::mlir::Type type, llvm_unreachable("unexpected 'allocator/context' type kind"); } -void PTENDialect::initialize() { +void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } -mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { +mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); if (keyword == "CPU_allocator") { @@ -77,8 +77,8 @@ mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { return mlir::Type(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pten/pten_base.h b/paddle/infrt/dialect/phi/phi_base.h similarity index 78% rename from paddle/infrt/dialect/pten/pten_base.h rename to paddle/infrt/dialect/phi/phi_base.h index c3be6ef4e8bf4..e3e58c2269620 100644 --- a/paddle/infrt/dialect/pten/pten_base.h +++ b/paddle/infrt/dialect/phi/phi_base.h @@ -19,12 +19,12 @@ #include -#include "paddle/infrt/dialect/pten/infrt_pten_base.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" namespace infrt { -namespace pten {} // namespace pten +namespace phi {} // namespace phi } // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc new file mode 100644 index 0000000000000..4e99661a6a205 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(std::make_unique(valid_places)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/dialect/pten/CMakeLists.txt b/paddle/infrt/dialect/pten/CMakeLists.txt deleted file mode 100644 index b4ed5cdc1d82f..0000000000000 --- a/paddle/infrt/dialect/pten/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -if (NOT INFRT_WITH_PTEN) - return() -endif() - -#mlir_tablegen_on(infrt_pten_base DIALECT pten) -add_mlir_dialect(infrt_pten_base pten) -add_mlir_dialect(infrt_pten_tensor pten_dt) -add_mlir_dialect(infrt_pten_kernel pten_kernel) -#mlir_tablegen_on(infrt_pten_tensor) - -gather_srcs(infrt_src SRCS - pten_base.cc infrt_pten_tensor.cc - infrt_pten_tensor.cc) diff --git a/paddle/infrt/dialect/pten/infrt_pten_kernel.td b/paddle/infrt/dialect/pten/infrt_pten_kernel.td deleted file mode 100644 index a3a1609d9918a..0000000000000 --- a/paddle/infrt/dialect/pten/infrt_pten_kernel.td +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef PTEN_KERNEL -#define PTEN_KERNEL - -include "paddle/infrt/dialect/pten/infrt_pten_tensor.td" - -def PTEN_KernelDialect : Dialect { - let name = "pten_kernel"; - - let description = [{ - The PTEN Kernel dialect. - }]; - - let cppNamespace = "::infrt::pten"; -} - -// PTEN Kernel related ops. -class PDT_Kernel traits = []> : Op { -} - -def FakeKernelOp : PDT_Kernel<"pten.matmul.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, TensorType:$x, TensorType:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); - let results = (outs TensorType:$output); -} - -#endif - diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index c4588d7cf8bab..f0c4723b49a79 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "infrt.return") { + if (last_op.getName().getStringRef() != "Infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td index 6aa12f252d014..6e4bc26aa1496 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/test_kernels.td @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. 
%x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. - infrt.return %x : i32 + Infrt.return %x : i32 } }]; diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 843b12ced21a9..1a7ea854c9ce4 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index bdac9ded2ef65..b0cabddc3ebc4 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !infrt.tensor, - %w : !infrt.tensor, - %bias : !infrt.tensor) -> !infrt.tensor +func @fc(%input : !Infrt.tensor, + %w : !Infrt.tensor, + %bias : !Infrt.tensor) -> !Infrt.tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : 
!Infrt.tensor) {value=3.0:f32} - infrt.benchmark "add.f32"( - %input:!infrt.tensor, - %w:!infrt.tensor, - %bias:!infrt.tensor) + Infrt.benchmark "add.f32"( + %input:!Infrt.tensor, + %w:!Infrt.tensor, + %bias:!Infrt.tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) - infrt.return %res : !infrt.tensor + %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) + Infrt.return %res : !Infrt.tensor } - infrt.return + Infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index e7b8e9efba838..d55d9904b5bc4 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} - "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !infrt.tensor) + dt.print_tensor (%input : !Infrt.tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !infrt.tensor) - dt.print_tensor (%bias : !infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + dt.print_tensor (%w : !Infrt.tensor) + dt.print_tensor (%bias : !Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) // test external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out1 : !infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out1 : !Infrt.tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : 
!infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out2 : !infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out2 : !Infrt.tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out3 : !infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out3 : !Infrt.tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out4 : !infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out4 : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 62c907bc9159f..79717ba2cc034 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -28,8 +28,8 @@ #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/kernel/pten/registry.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/kernel/phi/registry.h" #endif static llvm::cl::list cl_shared_libs( // NOLINT @@ -56,8 +56,8 @@ int main(int argc, char** argv) { kernel::RegisterTensorShapeKernels(®istry); kernel::RegisterTensorKernels(®istry); kernel::RegisterControlFlowKernels(®istry); -#ifdef INFRT_WITH_PTEN - kernel::RegisterPtenKernels(®istry); +#ifdef INFRT_WITH_PHI + kernel::RegisterPhiKernels(®istry); #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 263d5884134b1..1b55b408f2b08 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "infrt.print.f32"(%v2) : 
(f32) -> () + "Infrt.print.f32"(%v2) : (f32) -> () - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "infrt.print.f32"(%v3) : (f32) -> () + "Infrt.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file. func @basic1() -> () { - %v0 = infrt.constant.f32 1.0 - "infrt.print.f32"(%v0) : (f32) -> () + %v0 = Infrt.constant.f32 1.0 + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 83afa1db8a91c..5a973a3eb23e6 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index a3130857b0ef7..22df1c8010d8d 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 3dbc7a702be38..b47e2b27eab7c 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -74,7 +74,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -224,7 +224,7 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "infrt.return"; + return op->getName().getStringRef() == "Infrt.return"; } bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { @@ -345,7 +345,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* results) { CHECK(results); - if (op->getName().getStringRef() == "infrt.return") { + if (op->getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -418,7 +418,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "infrt.call") return false; + if (op->getName().getStringRef() != "Infrt.call") return 
false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index fcd79eaf386ee..0c453651d9e6d 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,7 +57,7 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "infrt.return" operation. + //! Emit a "Infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 375daa4515e17..5824e40abf97a 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -88,18 +88,20 @@ TEST(TestMlir, shadow_copy_tensor_profile) { mlir::MLIRContext* context = infrt::Global::getMLIRContext(); auto head = R"ROC( -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { )ROC"; auto tpl0 = - "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " - "!infrt.tensor"; + "%a{0} = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto tpl1 = - "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " - "!infrt.tensor"; + "%b{0} = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto end = R"ROC( -infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc index 222c5dcd6c575..3f40490557290 100644 --- a/paddle/infrt/host_context/value.cc +++ b/paddle/infrt/host_context/value.cc @@ -24,7 +24,7 @@ ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} ValueRef::ValueRef(float val) : Shared(new Value(val)) {} ValueRef::ValueRef(double val) : Shared(new Value(val)) {} ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} -ValueRef::ValueRef(backends::CpuPtenContext&& val) 
+ValueRef::ValueRef(backends::CpuPhiContext&& val) : Shared(new Value(std::move(val))) {} ValueRef::ValueRef(::phi::CPUContext&& val) : Shared(new Value(std::move(val))) {} diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index c39ddf69a90e2..21c06c4bfd8f4 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -29,9 +29,9 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/backends/host/pten_allocator.h" -#include "paddle/infrt/backends/host/pten_context.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/backends/host/phi_allocator.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -61,11 +61,11 @@ using ValueVariantType = tensor::DenseHostTensor, MlirFunctionExecutable*, tensor::TensorMap, -#ifdef INFRT_WITH_PTEN +#ifdef INFRT_WITH_PHI ::phi::MetaTensor, ::phi::DenseTensor, - backends::CpuPtenAllocator, - backends::CpuPtenContext, + backends::CpuPhiAllocator, + backends::CpuPhiContext, ::phi::CPUContext, std::vector, paddle::experimental::ScalarBase, @@ -108,12 +108,12 @@ class Value : public common::Object { explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(MlirFunctionExecutable* x) : data(x) {} -#ifdef INFRT_WITH_PTEN - explicit Value(backends::CpuPtenContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_PHI + explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} - explicit Value(backends::CpuPtenAllocator&& x) : data(std::move(x)) {} + explicit Value(backends::CpuPhiAllocator&& x) : data(std::move(x)) {} #endif template @@ -173,7 +173,7 @@ class ValueRef : common::Shared { explicit ValueRef(double val); explicit ValueRef(bool val); explicit ValueRef(::phi::MetaTensor&& val); - explicit ValueRef(backends::CpuPtenContext&& x); + explicit ValueRef(backends::CpuPhiContext&& x); explicit ValueRef(::phi::CPUContext&& x); explicit ValueRef(::phi::DenseTensor&& x); diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index 402665119ac2d..f1cbfba1c46b3 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,10 +1,10 @@ -add_subdirectory(pten) +add_subdirectory(phi) core_gather_headers() gather_srcs(infrt_src SRCS basic_kernels.cc - # pten_kernels.cc + # phi_kernels.cc test_kernels.cc tensor_shape_kernels.cc tensor_kernels.cc diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index b186cfcfd2b35..23e50a5ddc874 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); } void 
RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 6cc94dbcce077..8b18aca021086 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/pten/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt similarity index 61% rename from paddle/infrt/kernel/pten/CMakeLists.txt rename to paddle/infrt/kernel/phi/CMakeLists.txt index fbb205e2af011..e21cacfbc10b3 100644 --- a/paddle/infrt/kernel/pten/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT INFRT_WITH_PTEN) +if (NOT INFRT_WITH_PHI) return() endif() @@ -11,16 +11,16 @@ gather_srcs(infrt_src SRCS allocator_kernels.cc ) -set(infrt_register_pten_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc) -set(infrt_register_pten_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_pten_kernel_function.sh) +set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc) +set(infrt_register_phi_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh) set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) add_custom_command( - OUTPUT ${infrt_register_pten_kernels_gen_source_file} - COMMAND sh ${infrt_register_pten_kernels_gen_file} + OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND sh ${infrt_register_phi_kernels_gen_file} DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} - COMMENT "infrt generate ${infrt_register_pten_kernels_gen_source_file}" + COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" VERBATIM) cc_library(infrt_naive SRCS 
infershaped/infershaped_kernel_launcher.cc diff --git a/paddle/infrt/kernel/pten/allocator_kernels.cc b/paddle/infrt/kernel/phi/allocator_kernels.cc similarity index 81% rename from paddle/infrt/kernel/pten/allocator_kernels.cc rename to paddle/infrt/kernel/phi/allocator_kernels.cc index d3ecbed15da96..eba12e688b4ae 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.cc +++ b/paddle/infrt/kernel/phi/allocator_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator() { return {}; } +backends::CpuPhiAllocator CreateCpuAllocator() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/allocator_kernels.h b/paddle/infrt/kernel/phi/allocator_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/allocator_kernels.h rename to paddle/infrt/kernel/phi/allocator_kernels.h index ddc316c269923..d10382f5e6014 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.h +++ b/paddle/infrt/kernel/phi/allocator_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator(); +backends::CpuPhiAllocator CreateCpuAllocator(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc similarity index 82% rename from paddle/infrt/kernel/pten/context_kernels.cc rename to paddle/infrt/kernel/phi/context_kernels.cc index 0c5e53212113b..ff9ae50bc4345 100644 --- a/paddle/infrt/kernel/pten/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/kernel/pten/context_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext() { return {}; } +backends::CpuPhiContext CreateCpuContext() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/context_kernels.h rename to paddle/infrt/kernel/phi/context_kernels.h index 95a20f912efbf..6fe1a01f770db 100644 --- a/paddle/infrt/kernel/pten/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_context.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext(); +backends::CpuPhiContext CreateCpuContext(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc similarity index 90% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.cc rename to paddle/infrt/kernel/phi/dense_tensor_kernels.cc index b21e418789663..ce9200b9918c0 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod) { return ::phi::DenseTensor(allocator, @@ -32,6 +32,6 @@ ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values) {} -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h similarity index 89% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.h rename to paddle/infrt/kernel/phi/dense_tensor_kernels.h index 41f701b01032a..25daf7027e8cb 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -14,22 +14,22 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc 
b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc similarity index 93% rename from paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc rename to paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index c781ca908fdf0..331ebcfb4a5d2 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -14,9 +14,9 @@ #include -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc similarity index 96% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index c21339bed3872..62b204b160448 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_utils.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_utils.h diff --git a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h similarity index 93% rename from paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index 9a3e978e966b0..713f7df7f5225 100644 --- a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -16,8 +16,8 @@ #include #include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include 
"paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" namespace infrt { namespace kernel { diff --git a/paddle/infrt/kernel/pten/registry.cc b/paddle/infrt/kernel/phi/registry.cc similarity index 65% rename from paddle/infrt/kernel/pten/registry.cc rename to paddle/infrt/kernel/phi/registry.cc index d70f5deca6aea..f4f0e75a987a2 100644 --- a/paddle/infrt/kernel/pten/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/registry.h" +#include "paddle/infrt/kernel/phi/registry.h" #include #include #include "paddle/infrt/host_context/kernel_registry.h" #include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/allocator_kernels.h" -#include "paddle/infrt/kernel/pten/context_kernels.h" -#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" -#include "paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h" #include "paddle/phi/include/infermeta.h" #include "paddle/phi/include/kernels.h" #include "paddle/phi/kernels/matmul_kernel.h" @@ -32,18 +32,18 @@ using infrt::host_context::Attribute; namespace infrt { namespace kernel { -void RegisterPtenKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("pten_dt.create_allocator.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuAllocator)); - registry->AddKernel("pten_dt.create_context.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuContext)); +void RegisterPhiKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("phi_dt.create_allocator.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuAllocator)); + registry->AddKernel("phi_dt.create_context.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuContext)); registry->AddKernel( - "pten_dt.create_dense_tensor.cpu.f32.nchw", - INFRT_KERNEL(infrt::kernel::pten::CreateDenseTensorCpuF32Nchw)); - registry->AddKernel("pten_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::pten::FillDenseTensorF32)); + "phi_dt.create_dense_tensor.cpu.f32.nchw", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); + registry->AddKernel("phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); registry->AddKernel( - "pten.matmul.host.fp32", + "phi.matmul.host.fp32", std::bind(&kernel::KernelLauncherFunc< decltype(&::phi::MatmulKernel), &::phi::MatmulKernel, diff --git a/paddle/infrt/kernel/pten/registry.h b/paddle/infrt/kernel/phi/registry.h similarity index 88% rename from paddle/infrt/kernel/pten/registry.h rename to paddle/infrt/kernel/phi/registry.h index c290f8ea524fb..c72085a50c1e7 100644 --- a/paddle/infrt/kernel/pten/registry.h +++ b/paddle/infrt/kernel/phi/registry.h @@ -27,9 +27,9 @@ namespace infrt { namespace kernel { /** - * Register all the pten kernels to registry. + * Register all the phi kernels to registry. 
*/ -void RegisterPtenKernels(host_context::KernelRegistry* registry); +void RegisterPhiKernels(host_context::KernelRegistry* registry); } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index ccfb3356a855f..d15bbe221f91a 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,8 +193,8 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); - registry->AddKernel("infrt.test.shadow_copy_tensor", + registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt new file mode 100755 index 0000000000000..51fecdf907798 --- /dev/null +++ b/paddle/infrt/pass/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(phi) diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 3c76b438a0eba..2d4d6f2629ec7 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,41 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "infrt.print.f32"(%value) : (f32) -> () + "Infrt.print.f32"(%value) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { - %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - infrt.return %z1 : f32 + %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + Infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = infrt.constant.f32 1.0 - %y = infrt.constant.f32 2.0 - %y1 = infrt.constant.f32 3.0 - %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = Infrt.constant.f32 1.0 + %y = Infrt.constant.f32 2.0 + %y1 = Infrt.constant.f32 3.0 + %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "infrt.print.f32"(%z) : (f32) -> () - infrt.return %z : f32 + "Infrt.print.f32"(%z) : (f32) -> () + Infrt.return %z : f32 } /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -// CHECK-LABEL: @string_test -func @string_test() { - %path = infrt.get_string("this is get_string op.") - // CHECK-LABEL: string = this is get_string op. 
- infrt.print_string(%path) - infrt.return -} diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 1a57b43499062..381fd534f6a5a 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) // CHECK-LABEL: BM:add.f32:CPU utilization(percent) - infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = infrt.constant.f32 1.0 - %1 = infrt.constant.f32 2.0 - %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "infrt.print.f32"(%res) : (f32) -> () - infrt.return %res : f32 + %0 = Infrt.constant.f32 1.0 + %1 = Infrt.constant.f32 2.0 + %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "Infrt.print.f32"(%res) : (f32) -> () + Infrt.return %res : f32 } - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index f1def17aa8796..faade62d35063 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -2,23 +2,23 @@ // CHECK-LABEL: dense_shape0 func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 111c01c9a108b..8e2d3bc49b96c 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,31 +1,31 @@ // CHECK-LABEL: @predict -func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.tensor +func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor // fc - 
"external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () - //dt.print_tensor (%out : !infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + //dt.print_tensor (%out : !Infrt.tensor) - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + %path = Infrt.get_string("/Infrt/build/paddle/paddle_1.8_fc_model") // CHECK-LABEL: loading params %map = dt.load_params(%path) - %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir index d98f107bab41e..75ec98f04661a 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir @@ -7,15 +7,15 @@ func @main() -> tensor { %bias1 = "pd.feed"() {name="input4"} : () -> tensor %bias2 = "pd.feed"() {name="input5"} : () -> tensor - %d = "pd.elementwise_add"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %d = "pd.elementwise_add"(%c, %bias) {axis=1:si32} : (tensor, tensor) -> tensor %e = "pd.relu6"(%d) {} : (tensor) -> tensor %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:si32} : (tensor, tensor) -> tensor %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor "pd.fetch"(%e2) {name="output"} :(tensor)->() diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 02511b21e4792..48ee4b9d725c0 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -3,8 +3,7 @@ func @ops() { %a = pd.feed() {name="input0"} : tensor %b = pd.feed() {name="input1"}: tensor - %d = pd.feed() {name="input3"}: !Infrt.lod_tensor<3x4x9xf32, 0> + %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir 
index 88f5b289fd9f8..21ba15d5fce7d 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -2,10 +2,10 @@ // CHECK-LABEL: @basic_tensor func @basic_tensor() { - %a = "pten_dt.create_allocator.cpu" (): () -> !pten.CPU_allocator - %b = "pten_dt.create_context.cpu" (): () -> !pten.CPU_context - %c = "pten_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!pten.CPU_allocator) -> (!infrt.tensor) - // "pten_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!infrt.tensor) -> () + %a = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator + %b = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %c = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + // "phi_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!Infrt.tensor) -> () - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/pten/pten_pass.mlir new file mode 100644 index 0000000000000..30ff2636ae5a4 --- /dev/null +++ b/paddle/infrt/tests/dialect/pten/pten_pass.mlir @@ -0,0 +1,10 @@ +// RUN: infrtopt %s | FileCheck %s +// CHECK-LABEL: @ops +func @ops() { + %a = pd.feed() {name="input0"} : !infrt.lod_tensor + %b = pd.feed() {name="input1"} : !infrt.lod_tensor + %d = pd.feed() {name="input3"} : !infrt.lod_tensor<3x4x9xf32, 0> + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index ff7f36f5078d6..76ae140dd6cbd 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -1,23 +1,23 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: dense_shape0 func @dense_shape0() { - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 914e863db49cc..52b296e06cd36 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ 
-2,34 +2,34 @@ // CHECK-LABEL: naive_elementwise_add func @naive_elementwise_add() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: naive_matmul func @naive_matmul() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 4edb918b5a28f..5c1396d47f551 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -1,15 +1,15 @@ // RUN: infrtexec -i %s | FileCheck %s func @load_tensor_map() { - %path = infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") + %path = Infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") %map = dt.load_params(%path) %size = dt.tensor_map_get_size(%map) -> i32 - infrt.print.i32 %size + Infrt.print.i32 %size - %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.tensor + %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 09210078b9d7d..5623aef71aa2c 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return 
+ Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index 01a2f7df32608..e580634055a72 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir index 09210078b9d7d..5623aef71aa2c 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index 01a2f7df32608..e580634055a72 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index ff86e7f52d535..8d858647ea63d 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -90,7 +90,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? 
if [ "$build_error" != 0 ];then exit 7; fi diff --git a/tools/infrt/get_pten_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh similarity index 89% rename from tools/infrt/get_pten_kernel_function.sh rename to tools/infrt/get_phi_kernel_function.sh index 488c5b4c4328d..3b9f4b7273500 100644 --- a/tools/infrt/get_pten_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -42,12 +42,12 @@ grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt #step 3: merge all infos -# @input1 => pten kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout) +# @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout) # @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name # @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has # same signature with kernel function -python3 ${PADDLE_ROOT}/tools/infrt/get_pten_kernel_info.py \ +python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \ --paddle_root_path ${PADDLE_ROOT} \ --kernel_info_file $kernel_register_info_file \ --infermeta_wrap_file ${temp_path}/wrap_info.txt \ - --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc + --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc diff --git a/tools/infrt/get_pten_kernel_info.py b/tools/infrt/get_phi_kernel_info.py similarity index 92% rename from tools/infrt/get_pten_kernel_info.py rename to tools/infrt/get_phi_kernel_info.py index 23296fb5a9436..b0c834718b1b3 100644 --- a/tools/infrt/get_pten_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -21,7 +21,7 @@ def parse_args(): - parser = argparse.ArgumentParser("gather pten kernel and infermate info") + parser = argparse.ArgumentParser("gather phi kernel and infermate info") parser.add_argument( "--paddle_root_path", type=str, @@ -31,7 +31,7 @@ def parse_args(): "--kernel_info_file", type=str, required=True, - help="kernel info file generated by get_pten_kernel_function.sh.") + help="kernel info file generated by get_phi_kernel_function.sh.") parser.add_argument( "--infermeta_wrap_file", type=str, @@ -41,7 +41,7 @@ def parse_args(): "--generate_file", type=str, required=True, - default="../paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc", + default="../paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc", help="generated file.") args = parser.parse_args() return args @@ -84,15 +84,15 @@ def merge(infer_meta_data, kernel_data, wrap_data): def gen_warn_info(): - return """// Generated by tools/infrt/gen_pten_kernel_register.py for infrt. + return """// Generated by tools/infrt/gen_phi_kernel_register.py for infrt. // DO NOT edit or include it within paddle. 
""" def gen_include_headers(): return """ -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/include/kernels.h" #include "paddle/phi/include/infermeta.h" @@ -240,8 +240,8 @@ def gen_register_info(resources: List[List[str]]): return res -def gen_pten_kernel_register_code(resources: List[List[str]], - src_file_path: str): +def gen_phi_kernel_register_code(resources: List[List[str]], + src_file_path: str): source_file = open(src_file_path, 'w') source_file.write(gen_warn_info()) source_file.write(gen_include_headers()) @@ -258,4 +258,4 @@ def gen_pten_kernel_register_code(resources: List[List[str]], kernel_data = get_kernel_info(args.kernel_info_file) info_meta_wrap_data = get_kernel_info(args.infermeta_wrap_file) out = merge(infer_meta_data, kernel_data, info_meta_wrap_data) - gen_pten_kernel_register_code(out, args.generate_file) + gen_phi_kernel_register_code(out, args.generate_file) From d56a0a1bea4be064e03648cfa587db4b01ce3d27 Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Tue, 22 Feb 2022 16:23:02 +0800 Subject: [PATCH 036/101] fix bug in new the_one_ps (#39505) * fix benchmark and communicator config * fix bugs of the_one_ps * multi program and fix bug in optimizer * multi program in the_one_ps * public commcontext --- .../communicator/communicator_common.h | 13 +- paddle/fluid/pybind/fleet_py.cc | 6 +- .../fleet/meta_optimizers/ps_optimizer.py | 2 + .../distributed/passes/ps_trainer_pass.py | 4 +- python/paddle/distributed/ps/the_one_ps.py | 160 +++++--- .../ps/utils/ps_program_builder.py | 8 +- python/paddle/distributed/ps/utils/public.py | 380 ++++++++++++------ .../fleet/parameter_server/ir/public.py | 11 +- 8 files changed, 400 insertions(+), 184 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h index 66784c53c0026..27b282a945d15 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h @@ -31,7 +31,8 @@ struct CommContext { const std::vector &origin_names, int id, bool merge_add_ = true, bool is_sparse_ = true, bool is_distributed_ = false, int table_id_ = -1, - bool is_tensor_table_ = false) + bool is_tensor_table_ = false, bool is_datanorm_table_ = false, + int64_t program_id_ = -1) : var_name(name), splited_varnames(names), epmap(emap), @@ -42,7 +43,9 @@ struct CommContext { is_sparse(is_sparse_), is_distributed(is_distributed_), table_id(table_id_), - is_tensor_table(is_tensor_table_) {} + program_id(program_id_), + is_tensor_table(is_tensor_table_), + is_datanorm_table(is_datanorm_table_) {} CommContext(const CommContext &ctx) { var_name = ctx.var_name; @@ -55,7 +58,9 @@ struct CommContext { origin_varnames = ctx.origin_varnames; is_distributed = ctx.is_distributed; table_id = ctx.table_id; + program_id = ctx.program_id; is_tensor_table = ctx.is_tensor_table; + is_datanorm_table = ctx.is_datanorm_table; } std::string print() const { @@ -78,7 +83,9 @@ struct CommContext { ss << " is_sparse: " << is_sparse; ss << " is_distributed: " << is_distributed << "\n"; ss << " table_id: " << table_id << "\n"; + ss << " program_id: " << program_id << "\n"; 
ss << " is_tensor_table: " << is_tensor_table << "\n"; + ss << " is_datanorm_table: " << is_datanorm_table << "\n"; return ss.str(); } @@ -93,7 +100,9 @@ struct CommContext { bool is_sparse; bool is_distributed; int table_id; + int64_t program_id; bool is_tensor_table; + bool is_datanorm_table; }; } // namespace distributed diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 73c8f362d145d..3145a9cf7655c 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -103,11 +103,13 @@ void BindCommunicatorContext(py::module* m) { py::init&, const std::vector&, const std::vector&, const std::vector&, int, bool, bool, bool, int, - bool>()) + bool, bool, int64_t>()) .def("var_name", [](const CommContext& self) { return self.var_name; }) .def("trainer_id", [](const CommContext& self) { return self.trainer_id; }) .def("table_id", [](const CommContext& self) { return self.table_id; }) + .def("program_id", + [](const CommContext& self) { return self.program_id; }) .def("split_varnames", [](const CommContext& self) { return self.splited_varnames; }) .def("split_endpoints", @@ -122,6 +124,8 @@ void BindCommunicatorContext(py::module* m) { [](const CommContext& self) { return self.origin_varnames; }) .def("is_tensor_table", [](const CommContext& self) { return self.is_tensor_table; }) + .def("is_datanorm_table", + [](const CommContext& self) { return self.is_datanorm_table; }) .def("__str__", [](const CommContext& self) { return self.print(); }); } diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index bc50bef010941..100a6882b1b35 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -46,7 +46,9 @@ def _init_ps_pass_context(self, loss, startup_program): attrs['loss'] = loss attrs['min_block_size'] = 81920 attrs['origin_main_program'] = loss.block.program + attrs['origin_main_programs'] = [loss.block.program] attrs['origin_startup_program'] = startup_program + attrs['origin_startup_programs'] = [startup_program] attrs['cloned_main'] = attrs['origin_main_program'].clone() attrs['cloned_startup'] = attrs['origin_startup_program'].clone() diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 3f39db69abdb2..284365ce06651 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -560,9 +560,9 @@ def _check_conflict(self, other_pass): return True def _get_sparse_table_names(self, attrs): - dist_varnames = get_sparse_tablenames(attrs['origin_main_program'], + dist_varnames = get_sparse_tablenames(attrs['origin_main_programs'], True) - sparse_varnames = get_sparse_tablenames(attrs['origin_main_program'], + sparse_varnames = get_sparse_tablenames(attrs['origin_main_programs'], False) return list(set(dist_varnames + sparse_varnames)) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index f842ca791f1e5..14a68ad916747 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -24,8 +24,8 @@ from paddle.fluid.executor import Executor from paddle.fluid.parallel_executor import ParallelExecutor from paddle.fluid.framework import Variable, Parameter -from .runtime_base import RuntimeBase -from ..base.private_helper_function import 
wait_server_ready +from paddle.distributed.fleet.runtime.runtime_base import RuntimeBase +from paddle.distributed.fleet.base.private_helper_function import wait_server_ready from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format @@ -39,8 +39,17 @@ def conv_indent(indent): PSERVER_SAVE_SUFFIX = ".shard" -def parse_table_class(varname, o_main_program): - for op in o_main_program.global_block().ops: +def get_program_by_id(context, program_id): + programs = context["origin_main_programs"] + for i, program in enumerate(programs): + if id(program) == program_id: + return program, context["origin_startup_programs"][i] + return None, None + + +def parse_table_class(varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + for op in main_program.global_block().ops: if not is_distributed_sparse_op(op) and not is_sparse_op(op): continue @@ -53,9 +62,10 @@ def parse_table_class(varname, o_main_program): return "MemorySparseTable" -def get_default_accessor_proto(accessor, varname, o_main_program): +def get_default_accessor_proto(accessor, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 - for var in o_main_program.list_vars(): + for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] break @@ -123,9 +133,10 @@ def get_default_accessor_proto(accessor, varname, o_main_program): sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) -def check_embedding_dim(accessor, varname, o_main_program): +def check_embedding_dim(accessor, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 - for var in o_main_program.list_vars(): + for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] break @@ -172,6 +183,8 @@ def __init__(self): self.dims = [] self.trainer_num = 0 self.sync = "false" + self.table_num = None + self.table_dim = None self.initializers = [] self.opt_input_map = {} self.opt_attr_map = {} @@ -192,6 +205,7 @@ def define_optimize_map(self): opt_input_map["sum"] = [("Param", None)] opt_input_map["naive_adagrad"] = [("Param", None), ("G2Sum", 1), ("LearningRate", 1)] + opt_input_map["summary"] = [("Param", None), ("SummaryDecayRate", 1)] opt_attr_map = {} opt_attr_map["sgd"] = [] @@ -201,6 +215,7 @@ def define_optimize_map(self): ("epsilon", "f")] opt_attr_map["adam_d2sum"] = [("beta1", "f"), ("beta2", "f"), ("epsilon", "f")] + opt_attr_map["summary"] = [] opt_init_map = {} opt_init_map["gaussian_random"] = ["seed", "mean", "std"] @@ -212,8 +227,9 @@ def define_optimize_map(self): self.opt_input_map = opt_input_map self.opt_init_map = opt_init_map - def parse_entry(self, varname, o_main_program): - for op in o_main_program.global_block().ops: + def parse_entry(self, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + for op in main_program.global_block().ops: if not is_distributed_sparse_op(op) and not is_sparse_op(op): continue @@ -243,23 +259,36 @@ def get_initializer_attr(self, value_name, o_startup_program): attr_str = "" origin_var_name = value_name + print("get_initializer_attr param name:", value_name) for op in o_startup_program.global_block().ops: if op.type in self.opt_init_map.keys( ) and origin_var_name == op.output("Out")[0]: init_attr = [op.type] + print("get_initializer_attr op type:", op.type) for attr in self.opt_init_map[op.type]: + 
print("get_initializer_attr opt_init_map attr:", attr) init_attr.append(str(op.attr(attr))) + print("get_initializer_attr op attr:", str(op.attr(attr))) attr_str = l_in.join(init_attr) break return attr_str - def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, - adam_d2sum): - main_program = context['origin_main_program'] - startup_program = context['startup_main_program'] + def parse_by_optimizer(self, ctx, context): + grad_name = ctx.origin_varnames()[0] + is_sparse = ctx.is_sparse() + size = ctx.sections()[0] + single_dim = ctx.sections()[1] if ctx.is_sparse() else 1 + adam_d2sum = context["user_defined_strategy"].adam_d2sum + print("parse_by_optimizer table_id:{} is_datanorm:{}".format( + ctx.table_id(), ctx.is_datanorm_table())) + + main_program, startup_program = get_program_by_id(context, + ctx.program_id()) pserver_id = get_role_id(context['role_maker']) pserver_num = len(get_ps_endpoints(context['role_maker'])) optimizer_ops = get_optimize_ops(main_program) + print("the one ps optimizer_ops:", optimizer_ops) + print("the one ps parse_by_optimizer grad_name:", grad_name) oop = None for op in optimizer_ops: @@ -278,6 +307,8 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, initializers = [] self.trainer_num = get_trainers(context['role_maker']) + self.table_num = size + self.table_dim = single_dim if oop.type != 'adam' and adam_d2sum == True: print('optimization algorithm is not adam, set adam_d2sum False') @@ -291,7 +322,11 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, param_varnames = self.opt_input_map["naive_adagrad"] attr_varnames = self.opt_attr_map["naive_adagrad"] self.accessor_class = "sgd" - elif adam_d2sum: + elif ctx.is_datanorm_table(): + param_varnames = self.opt_input_map["summary"] + attr_varnames = self.opt_attr_map["summary"] + self.accessor_class = "summary" + elif adam_d2sum and not is_sparse: param_varnames = self.opt_input_map["adam_d2sum"] attr_varnames = self.opt_attr_map["adam_d2sum"] self.accessor_class = "adam_d2sum" @@ -306,10 +341,9 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, #for dims if shape is None: if is_sparse: - shape = total_dims + shape = single_dim else: - shape = self.get_shard(total_dims, pserver_num, - pserver_id) + shape = self.get_shard(size, pserver_num, pserver_id) dims.append(shape) #for initializers @@ -333,6 +367,27 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, else: initializer = "fill_constant&0" initializers.append(initializer) + elif self.accessor_class == "summary": + #for dims + if shape is None: + if is_sparse: + shape = single_dim + else: + shape = self.get_shard(size, pserver_num, pserver_id) + dims.append(shape) + + #for initializers + if formal_name == "Param": + param = main_program.global_block().vars[oop.input( + formal_name)[0]] + + initializer = self.get_initializer_attr(param.name, + startup_program) + elif formal_name == "SummaryDecayRate": + initializer = "fill_constant&0.99999" + else: + initializer = "fill_constant&0" + initializers.append(initializer) else: if formal_name == "G2Sum": dims.append(1) @@ -348,9 +403,9 @@ def parse_by_optimizer(self, grad_name, is_sparse, total_dims, context, if shape is None: if is_sparse: - shape = total_dims + shape = single_dim else: - shape = self.get_shard(total_dims, pserver_num, + shape = self.get_shard(size, pserver_num, pserver_id) dims.append(shape) @@ -379,6 +434,10 @@ def to_string(self, indent): attrs += "entry: \"{}\" 
".format(self.entry) attrs += "trainer_num: {} ".format(self.trainer_num) attrs += "sync: {} ".format(self.sync) + if self.table_num: + attrs += "table_num: {} ".format(self.table_num) + if self.table_dim: + attrs += "table_dim: {} ".format(self.table_dim) for param in self.params: attrs += "params: \"{}\" ".format(param) @@ -448,10 +507,7 @@ def to_string(self, indent): accessor_str = accessor_str.format( conv_indent(indent), self.accessor_proto, conv_indent(indent)) attrs += accessor_str + "\n" - return table_str.format( - conv_indent(indent), attrs, conv_indent(indent)) - - if self.accessor is not None: + elif self.accessor is not None: attrs += self.accessor.to_string(indent) attrs += "\n" @@ -607,7 +663,9 @@ def __init__(self): def _set_basic_info(self, context): self.context = context self.role_maker = context["role_maker"] + self.origin_main_program = context["origin_main_program"] + self.origin_main_programs = context["origin_main_programs"] self.context[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode @@ -615,10 +673,13 @@ def _set_basic_info(self, context): self.context['trainer'] = TrainerRuntimeConfig(context[ 'valid_strategy']) self.context['ps_mode'] = self.context['trainer'].mode - self.context['use_ps_gpu'] = context['valid_strategy'].use_ps_gpu + self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[ + 'use_ps_gpu'] self.is_sync = True if self.context[ 'ps_mode'] == DistributedMode.SYNC else False self.context['grad_name_to_param_name'] = {} + self.context['tensor_table'] = {} + build_var_distributed(self.context) def _init_worker(self): worker = self._get_fleet_proto(is_server=False, is_sync=self.is_sync) @@ -689,6 +750,7 @@ def sync_strategy_envs(): sync_kwargs = sync_strategy_envs() kwargs.update(sync_kwargs) + print("communicator config:", trainer_config.get_communicator_flags()) self._communicator = Communicator( trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) @@ -893,7 +955,7 @@ def _get_tables(): common.table_name = self.context['grad_name_to_param_name'][ ctx.origin_varnames()[0]] - if self.ps_mode == DistributedMode.GEO: + if self.context['ps_mode'] == DistributedMode.GEO: table.table_class = "SparseGeoTable" else: all_table_proto = self.context[ @@ -907,7 +969,8 @@ def _get_tables(): table.table_class = table_proto.table_class else: table.table_class = parse_table_class( - common.table_name, self.origin_main_program) + common.table_name, + ctx.program_id(), self.context) if table.table_class != 'MemorySparseTable': table.table_class = 'MemorySparseTable' warnings.warn( @@ -925,12 +988,12 @@ def _get_tables(): warnings.warn( "The accessor of sparse table is not set, use default value." 
) - get_default_accessor_proto(table_proto.accessor, - common.table_name, - self.origin_main_program) + get_default_accessor_proto( + table_proto.accessor, common.table_name, + ctx.program_id(), self.context) check_embedding_dim(table_proto.accessor, common.table_name, - self.origin_main_program) + ctx.program_id(), self.context) table.accessor_proto = text_format.MessageToString( table_proto.accessor) else: @@ -940,15 +1003,11 @@ def _get_tables(): common.table_name = "MergedDense" adam_d2sum = self.context["user_defined_strategy"].adam_d2sum - common.parse_by_optimizer(ctx.origin_varnames()[0], - ctx.is_sparse(), - ctx.sections()[1] if ctx.is_sparse() - else ctx.sections()[0], self.context, - adam_d2sum) + common.parse_by_optimizer(ctx, self.context) if ctx.is_sparse(): common.parse_entry(common.table_name, - self.origin_main_program) + ctx.program_id(), self.context) if is_sync: common.sync = "true" @@ -1023,8 +1082,9 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): self._server.init_server(proto_txt, string_hosts, role_id, trainers, self._server_sub_program) - dist_varnames = get_sparse_tablenames(self.origin_main_program, True) - sparse_varnames = get_sparse_tablenames(self.origin_main_program, False) + dist_varnames = get_sparse_tablenames(self.origin_main_programs, True) + sparse_varnames = get_sparse_tablenames(self.origin_main_programs, + False) distributed_varnames = dist_varnames + sparse_varnames @@ -1070,6 +1130,7 @@ def is_valid(var): if var.name in exclude_var_names: return False + from .utils.public import _get_varname_parts origin_varname, _, _ = _get_varname_parts(var.name) if origin_varname.endswith("@GRAD"): return False @@ -1085,16 +1146,24 @@ def is_valid(var): return is_valid + def _get_inference_model_path(self, dirname): + if dirname.startswith("afs:") or dirname.startswith("hdfs:"): + model_path = "./dnn_plugin" + else: + model_path = os.path.join(dirname, "dnn_plugin") + return model_path + def _save_sparse_params(self, executor, dirname, context, main_program, mode): - distributed_varnames = get_sparse_tablenames( - self.context['origin_main_program'], True) + distributed_varnames = get_sparse_tablenames(self.origin_main_programs, + True) values = [] + model_path = self._get_inference_model_path(dirname) for id, names in context.items(): if names[0] not in distributed_varnames: # only save sparse param to local try: - self._worker.recv_and_save_model(id, dirname) + self._worker.recv_and_save_model(id, model_path) except: pass # save sparse & distributed param on server @@ -1221,10 +1290,7 @@ def _ps_inference_save_inference_model(self, infer_program._copy_dist_param_info_from(program) - if dirname.startswith("afs:") or dirname.startswith("hdfs:"): - model_path = "./dnn_plugin" - else: - model_path = os.path.join(dirname, "dnn_plugin") + model_path = self._get_inference_model_path(dirname) model_basename = "__model__" model_basename = os.path.join(model_path, model_basename) paddle.save(infer_program, model_basename) @@ -1266,7 +1332,7 @@ def _save_persistables(self, *args, **kwargs): self._ps_inference_save_persistables(*args, **kwargs) def _load_sparse_params(self, dirname, context, main_program, mode): - distributed_varnames = get_sparse_tablenames(self.origin_main_program, + distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) values = [] for id, names in context.items(): diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 
c6afd0cb03bf3..25e4dc28bdcb8 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -79,7 +79,7 @@ def __init__(self, pass_ctx): super(GeoPsProgramBuilder, self).__init__(pass_ctx) if self.ps_mode != DistributedMode.GEO: raise ValueError("ps mode: {} not matched {}", - format(ps_mode, "GeoPsProgramBuilder")) + format(self.ps_mode, "GeoPsProgramBuilder")) def _build_trainer_programs(self): append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs) @@ -97,9 +97,9 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) - if self.ps_mode != DistributedMode.SYNC: + if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC: raise ValueError("ps mode: {} not matched {}", - format(ps_mode, "CpuSyncPsProgramBuilder")) + format(self.ps_mode, "CpuSyncPsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", @@ -178,7 +178,7 @@ def __init__(self, pass_ctx): if self.use_ps_gpu or self.ps_mode == DistributedMode.GEO or self.attrs[ 'is_heter_ps_mode'] == False: raise ValueError("ps mode: {} not matched {}", - format(ps_mode, "HeterAsyncPsProgramBuilder")) + format(self.ps_mode, "HeterAsyncPsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index 7743db1057dd6..ebec6900e38f5 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -54,6 +54,9 @@ } DEFAULT_DEVICE = 'cpu' +DATA_NORM_NAME = [".batch_size", ".batch_sum", ".batch_square_sum"] +DATA_NORM_GRAD_NAME = [x + "@GRAD" for x in DATA_NORM_NAME] + def logger_config(log_path, logging_name): logger = logging.getLogger(logging_name) @@ -84,6 +87,8 @@ class DistributedMode: class TrainerRuntimeConfig(object): def __init__(self, valid_strategy): self.mode = None + num_threads = os.getenv("CPU_NUM", "1") + send_queue_size = num_threads k_steps = valid_strategy.a_sync_configs["k_steps"] logger.info("ps mode in strategy: {}, {}".format( valid_strategy.a_sync, valid_strategy.a_sync_configs["k_steps"])) @@ -95,14 +100,13 @@ def __init__(self, valid_strategy): if valid_strategy.a_sync and k_steps > 0: self.mode = DistributedMode.GEO - - num_threads = os.getenv("CPU_NUM", "1") + send_queue_size = k_steps self.runtime_configs = {} self.runtime_configs['communicator_max_merge_var_num'] = os.getenv( - "FLAGS_communicator_max_merge_var_num", num_threads) + "FLAGS_communicator_max_merge_var_num", send_queue_size) self.runtime_configs['communicator_send_queue_size'] = os.getenv( - "FLAGS_communicator_send_queue_size", num_threads) + "FLAGS_communicator_send_queue_size", send_queue_size) self.runtime_configs[ 'communicator_independent_recv_thread'] = os.getenv( "FLAGS_communicator_independent_recv_thread", "1") @@ -116,6 +120,55 @@ def __init__(self, valid_strategy): self.runtime_configs['communicator_is_sgd_optimizer'] = os.getenv( "FLAGS_communicator_is_sgd_optimizer", "1") + def get_communicator_flags(self): + need_keys = [] + num_threads = os.getenv("CPU_NUM", "1") + mode_str = "" + if self.mode is None or self.mode == DistributedMode.ASYNC: + need_keys = self.runtime_configs.keys() + mode_str = "async" + elif self.mode == DistributedMode.SYNC or 
self.mode == DistributedMode.HALF_ASYNC: + mode_str = "sync or half_async" + need_keys = [ + 'communicator_max_merge_var_num', + 'communicator_send_wait_times', 'communicator_thread_pool_size', + 'communicator_send_queue_size' + ] + elif self.mode == DistributedMode.GEO: + mode_str = "GEO" + need_keys = [ + 'communicator_thread_pool_size', 'communicator_send_wait_times', + 'communicator_max_merge_var_num', 'communicator_send_queue_size' + ] + else: + raise ValueError("Unsupported Mode") + + if self.mode == DistributedMode.SYNC or self.mode == DistributedMode.HALF_ASYNC: + max_merge_var_num = self.runtime_configs[ + 'communicator_max_merge_var_num'] + send_queue_size = self.runtime_configs[ + 'communicator_send_queue_size'] + if max_merge_var_num != num_threads: + print('WARNING: In {} mode, communicator_max_merge_var_num ' + 'must be equal to CPU_NUM. But received, ' + 'communicator_max_merge_var_num = {}, CPU_NUM = ' + '{}. communicator_max_merge_var_num will be forced to {}.' + .format(mode_str, max_merge_var_num, num_threads, + num_threads)) + self.runtime_configs[ + 'communicator_max_merge_var_num'] = num_threads + if send_queue_size != num_threads: + print('WARNING: In {} mode, communicator_send_queue_size ' + 'must be equal to CPU_NUM. But received, ' + 'communicator_send_queue_size = {}, CPU_NUM = ' + '{}. communicator_send_queue_size will be forced to {}.' + .format(mode_str, send_queue_size, num_threads, + num_threads)) + self.runtime_configs[ + 'communicator_send_queue_size'] = num_threads + + return dict((key, str(self.runtime_configs[key])) for key in need_keys) + def get_lr_ops(program): lr_ops = [] @@ -176,6 +229,13 @@ def get_ps_endpoint(role_maker): return role_maker.get_pserver_endpoints()[get_role_id(role_maker)] +def get_ps_endpoints(role_maker): + try: + return role_maker._get_pserver_endpoints() + except Exception: + return role_maker.get_pserver_endpoints() + + def get_heter_worker_endpoint(role_maker): try: return role_maker._get_heter_worker_endpoint() @@ -224,26 +284,20 @@ def is_sparse_op(op): return False -def get_sparse_tablenames(program, is_distributed): +def get_sparse_tablenames(programs, is_distributed): tablenames = set() - if is_distributed: - for op in program.global_block().ops: - if is_distributed_sparse_op(op): - tablenames.add(get_sparse_tablename(op)) - else: - for op in program.global_block().ops: - if is_sparse_op(op): - tablenames.add(get_sparse_tablename(op)) + for program in programs: + if is_distributed: + for op in program.global_block().ops: + if is_distributed_sparse_op(op): + tablenames.add(get_sparse_tablename(op)) + else: + for op in program.global_block().ops: + if is_sparse_op(op): + tablenames.add(get_sparse_tablename(op)) return list(tablenames) -def get_ps_endpoints(role_maker): - try: - return role_maker._get_pserver_endpoints() - except Exception: - return role_maker.get_pserver_endpoints() - - def get_trainers(role_maker): try: return role_maker._worker_num() @@ -251,7 +305,7 @@ def get_trainers(role_maker): return role_maker.worker_num() -def get_dense_send_context(context, +def get_dense_send_context(program, send_ctx, idx, merged_dense_pairs, @@ -260,34 +314,72 @@ def get_dense_send_context(context, if len(merged_dense_pairs) < 1: return idx if not split_dense_table: + dense_pairs = [] + data_norm_pairs = [] + for merged in merged_dense_pairs: + is_data_norm = False + grad = merged[1] + varname = grad.merged_var.name + for name in DATA_NORM_GRAD_NAME: + if varname.endswith(name): + is_data_norm = True + if is_data_norm: + 
data_norm_pairs.append(merged) + else: + dense_pairs.append(merged) + + # simple dense table origin_varnames = [] var_numel = 0 - for merged in merged_dense_pairs: + for merged in dense_pairs: grad = merged[1] origin_varnames.append(grad.merged_var.name) - var = context['origin_main_program'].global_block().vars[ - grad.merged_var.name] + var = program.global_block().vars[grad.merged_var.name] var_numel += reduce(lambda x, y: x * y, var.shape) - grad_name = "Dense@Grad" - trainer_id = get_role_id(context['role_maker']) + grad_name = "Dense@GRAD_" + str(idx) aggregate = True + print("public get_dense_send_context dense_table:", grad_name, + var_numel, origin_varnames) dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], [var_numel], origin_varnames, trainer_id, - aggregate, False, False, idx, False) + aggregate, False, False, idx, False, False, + id(program)) send_ctx[grad_name] = dense_ctx idx += 1 + + if len(data_norm_pairs) <= 0: + return idx + + # data norm table + origin_varnames = [] + var_numel = 0 + for merged in data_norm_pairs: + grad = merged[1] + origin_varnames.append(grad.merged_var.name) + var = program.global_block().vars[grad.merged_var.name] + var_numel += reduce(lambda x, y: x * y, var.shape) + grad_name = "DataNorm@GRAD_" + str(idx) + aggregate = True + print("public get_dense_send_context data_norm table:", grad_name, + var_numel, origin_varnames) + data_norm_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], + [var_numel], origin_varnames, trainer_id, + aggregate, False, False, idx, False, True, + id(program)) + send_ctx[grad_name] = data_norm_ctx + idx += 1 else: for merged in merged_dense_pairs: grad = merged[1] origin_varname = grad.merged_var.name - var = context['origin_main_program'].global_block().vars[ - origin_varname] + var = program.global_block().vars[origin_varname] var_numel = reduce(lambda x, y: x * y, var.shape) grad_name = origin_varname aggregate = True dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], [var_numel], [origin_varname], trainer_id, - aggregate, False, False, idx, False) + aggregate, False, False, idx, False, False, + id(program)) send_ctx[grad_name] = dense_ctx idx += 1 return idx @@ -299,25 +391,28 @@ def get_geo_trainer_send_context(context): format(ps_mode, "get_geo_trainer_send_context")) send_ctx = {} trainer_id = get_role_id(context['role_maker']) + origin_programs = context['origin_main_programs'] idx = 0 - distibuted_varnames = get_sparse_tablenames(context['origin_main_program'], - True) - for merged in context['merged_sparse_pairs']: - param, grad = merged - grad_name = grad.merged_var.name - param_name = param.merged_var.name - is_distributed = True if param_name in distibuted_varnames else False - - var = context['origin_main_program'].global_block().vars[ - grad.merged_var.name] - var_numel = reduce(lambda x, y: x * y, var.shape[1:]) - - sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], - [var_numel], [grad_name], trainer_id, True, - True, is_distributed, idx, False) - idx += 1 - send_ctx[sparse_ctx.var_name()] = sparse_ctx + distibuted_varnames = get_sparse_tablenames(origin_programs, True) + for i, program in enumerate(origin_programs): + merged_sparse_pairs = context['merged_sparse_pairs'][i] + for merged in merged_sparse_pairs: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + is_distributed = True if param_name in distibuted_varnames else False + + var = program.global_block().vars[grad.merged_var.name] + 
var_numel = reduce(lambda x, y: x * y, var.shape[1:]) + + sparse_ctx = CommContext(grad_name, [grad_name], + ["127.0.0.1:6071"], [var_numel], + [grad_name], trainer_id, True, True, + is_distributed, idx, False, False, + id(program)) + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx if len(send_ctx) == 0: raise ValueError("GeoSGD require sparse parameters in your net.") @@ -336,7 +431,7 @@ def _step_ctx(idx, role_maker): sections = [1] * len(endpoints) names = [name] * len(endpoints) ctx = CommContext(name, names, endpoints, sections, [name], trainer_id, - True, False, False, idx, True) + True, False, False, idx, True, False, -1) return name, ctx @@ -348,36 +443,45 @@ def get_the_one_send_context(context, ep_list = ["127.0.0.1:6071"] send_ctx = {} trainer_id = get_role_id(context['role_maker']) + origin_programs = context['origin_main_programs'] idx = 0 - idx += get_dense_send_context(context, send_ctx, idx, - context['merged_dense_pairs'], trainer_id, - split_dense_table) - distibuted_varnames = get_sparse_tablenames(context['origin_main_program'], - True) - for merged in context['merged_sparse_pairs']: - param, grad = merged - grad_name = grad.merged_var.name - param_name = param.merged_var.name - splited_varname = [] - - for i in range(len(ep_list)): - splited_varname.append("{}.block{}".format(param_name, i)) - - is_distributed = True if param_name in distibuted_varnames else False - - var = context['origin_main_program'].global_block().vars[ - grad.merged_var.name] - - shape = list(var.shape) - shape[0] = 0 if is_distributed else shape[0] - - sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape, - [grad_name], trainer_id, True, True, - is_distributed, idx, False) + for i, program in enumerate(origin_programs): + merged_dense_pairs = context['merged_dense_pairs'][i] + idx += get_dense_send_context(program, send_ctx, idx, + merged_dense_pairs, trainer_id, + split_dense_table) + distibuted_varnames = get_sparse_tablenames(origin_programs, True) + print("public distibuted_varnames:", distibuted_varnames) + for i, program in enumerate(origin_programs): + merged_sparse_pairs = context['merged_sparse_pairs'][i] + for merged in merged_sparse_pairs: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + splited_varname = [] + + for i in range(len(ep_list)): + splited_varname.append("{}.block{}".format(param_name, i)) + + is_distributed = True if param_name in distibuted_varnames else False + + var = program.global_block().vars[grad.merged_var.name] + + shape = list(var.shape) + shape[0] = 0 if is_distributed else shape[0] + + print("public get_the_one_send_context sparse:", grad_name, + splited_varname, shape) + if grad_name in send_ctx: + continue + sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape, + [grad_name], trainer_id, True, True, + is_distributed, idx, False, False, + id(program)) - idx += 1 - send_ctx[sparse_ctx.var_name()] = sparse_ctx + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx if len(context['tensor_table']) > 0 and context['is_worker']: name, ctx = _step_ctx(idx, context['role_maker']) @@ -1073,7 +1177,7 @@ def get_the_one_recv_context(context, param_names = [] for grad_varname in origin_grad_varnames: - param_name = grad_name_to_param_name[grad_varname] + param_name = context["grad_name_to_param_name"][grad_varname] param_names.append(param_name) recv_id_maps[ctx.table_id()] = param_names else: @@ -1090,7 +1194,7 @@ def get_the_one_recv_context(context, param_names = [] for 
grad_varname in origin_grad_varnames: - param_name = grad_name_to_param_name[grad_varname] + param_name = context["grad_name_to_param_name"][grad_varname] param_names.append(param_name) recv_id_maps[ctx.table_id()] = param_names return recv_id_maps @@ -1141,58 +1245,88 @@ def __init__(self, merged, ordered, offsets): def build_var_distributed(context): - sparse_pairs, dense_pairs = get_param_grads(context['origin_main_program']) - origin_for_sparse = [] - origin_for_dense = [] - param_name_grad_name = {} + origin_programs = context['origin_main_programs'] + + param_name_to_grad_name = {} grad_name_to_param_name = {} - context["merged_variables_pairs"] = [] + context["origin_sparse_pairs"] = [] + context["origin_dense_pairs"] = [] context["merged_sparse_pairs"] = [] context['merged_dense_pairs'] = [] + context["merged_variables_pairs"] = [] context["merged_variable_map"] = {} - - for param, grad in sparse_pairs: - origin_for_sparse.append((param, grad)) - - for param, grad in dense_pairs: - origin_for_dense.append((param, grad)) - - for dense_pair in origin_for_dense: - param, grad = dense_pair - - m_param = MergedVariable(param, [param], [0]) - m_grad = MergedVariable(grad, [grad], [0]) - context["merged_variables_pairs"].append((m_param, m_grad)) - context["merged_dense_pairs"].append((m_param, m_grad)) - - for sparse_pair in origin_for_sparse: - param, grad = sparse_pair - - m_param = MergedVariable(param, [param], [0]) - m_grad = MergedVariable(grad, [grad], [0]) - context["merged_variables_pairs"].append((m_param, m_grad)) - context["merged_sparse_pairs"].append((m_param, m_grad)) - - for merged in context["merged_variables_pairs"]: - m_param, m_grad = merged - context["merged_variable_map"][ - m_param.merged_var.name] = m_param.merged_var - context["merged_variable_map"][ - m_grad.merged_var.name] = m_grad.merged_var - - param_merges = [] - param_merges.extend(origin_for_sparse) - param_merges.extend(origin_for_dense) - - for param, grad in param_merges: - param_name_grad_name[param.name] = grad.name - grad_name_to_param_name[grad.name] = param.name - - context["origin_sparse_pairs"] = origin_for_sparse - context["origin_dense_pairs"] = origin_for_dense - context["param_name_to_grad_name"] = param_name_grad_name + for origin_program in origin_programs: + sparse_pairs, dense_pairs = get_param_grads(origin_program) + print("public build_var_distributed sparse_pairs:", sparse_pairs) + print("public build_var_distributed dense_pairs:", dense_pairs) + origin_for_sparse = [] + origin_for_dense = [] + merged_sparse_pairs = [] + merged_dense_pairs = [] + merged_variables_pairs = [] + + for param, grad in sparse_pairs: + origin_for_sparse.append((param, grad)) + + for param, grad in dense_pairs: + origin_for_dense.append((param, grad)) + + for dense_pair in origin_for_dense: + param, grad = dense_pair + + m_param = MergedVariable(param, [param], [0]) + m_grad = MergedVariable(grad, [grad], [0]) + merged_variables_pairs.append((m_param, m_grad)) + merged_dense_pairs.append((m_param, m_grad)) + print("public build_var_distributed merged_dense_pairs:", + merged_dense_pairs) + + for sparse_pair in origin_for_sparse: + param, grad = sparse_pair + + m_param = MergedVariable(param, [param], [0]) + m_grad = MergedVariable(grad, [grad], [0]) + merged_variables_pairs.append((m_param, m_grad)) + merged_sparse_pairs.append((m_param, m_grad)) + print("public build_var_distributed merged_sparse_pairs:", + merged_sparse_pairs) + + for merged in merged_variables_pairs: + m_param, m_grad = merged + 
context["merged_variable_map"][ + m_param.merged_var.name] = m_param.merged_var + context["merged_variable_map"][ + m_grad.merged_var.name] = m_grad.merged_var + + param_merges = [] + param_merges.extend(origin_for_sparse) + param_merges.extend(origin_for_dense) + + for param, grad in param_merges: + param_name_to_grad_name[param.name] = grad.name + grad_name_to_param_name[grad.name] = param.name + + context["origin_sparse_pairs"].append(origin_for_sparse) + context["origin_dense_pairs"].append(origin_for_dense) + context["merged_sparse_pairs"].append(merged_sparse_pairs) + context['merged_dense_pairs'].append(merged_dense_pairs) + + context["param_name_to_grad_name"] = param_name_to_grad_name context["grad_name_to_param_name"] = grad_name_to_param_name + print("public build_var_distributed origin_sparse_pairs:", + context["origin_sparse_pairs"]) + print("public build_var_distributed origin_for_dense:", + context["origin_dense_pairs"]) + print("public build_var_distributed merged_sparse_pairs:", + context["merged_sparse_pairs"]) + print("public build_var_distributed merged_dense_pairs:", + context['merged_dense_pairs']) + print("public build_var_distributed param_name_to_grad_name:", + param_name_to_grad_name) + print("public build_var_distributed grad_name_to_param_name:", + grad_name_to_param_name) + def _is_opt_role_op(op): # NOTE : depend on oprole to find out whether this op is for diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 4b8c7ccbb69cf..b6ec09bab7254 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -577,7 +577,7 @@ def get_the_one_trainer_send_context(self, split_dense_table): sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], [var_numel], [grad_name], trainer_id, True, True, - is_distributed, idx, False) + is_distributed, idx, False, False, -1) idx += 1 send_ctx[sparse_ctx.var_name()] = sparse_ctx @@ -615,7 +615,8 @@ def get_dense_send_context(self, aggregate = True dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], [var_numel], origin_varnames, trainer_id, - aggregate, False, False, idx, False) + aggregate, False, False, idx, False, False, + -1) send_ctx[grad_name] = dense_ctx idx += 1 else: @@ -630,7 +631,7 @@ def get_dense_send_context(self, dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], [var_numel], [origin_varname], trainer_id, aggregate, - False, False, idx, False) + False, False, idx, False, False, -1) send_ctx[grad_name] = dense_ctx idx += 1 return idx @@ -672,7 +673,7 @@ def get_the_one_send_context(self, sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape, [grad_name], trainer_id, True, True, - is_distributed, idx, False) + is_distributed, idx, False, False, -1) idx += 1 send_ctx[sparse_ctx.var_name()] = sparse_ctx @@ -750,7 +751,7 @@ def _step_ctx(self, idx): sections = [1] * len(endpoints) names = [name] * len(endpoints) ctx = CommContext(name, names, endpoints, sections, [name], trainer_id, - True, False, False, idx, True) + True, False, False, idx, True, False, -1) return name, ctx def _create_vars_from_blocklist(self, block_list): From 728c06249ccb1477eb357a67873ab25536f1074a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 22 Feb 2022 17:07:30 +0800 Subject: [PATCH 037/101] =?UTF-8?q?change=20Vector=20to=20std::vector=20an?= 
=?UTF-8?q?d=20provide=20MixVector=20class=20as=20a=20helper=20=E2=80=A6?= =?UTF-8?q?=20(#39559)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * change Vector to std::vector and provide MixVector class as a helper wrapper class * solve the multi-gpu hang problem * remove the duplicate template instantialize * Copy vector to cpu * add CopyToCPU * xxx * final version: fix the problem of all reduce * remove mixvector dependence * fix * merge * fix code * fix by CI --- paddle/fluid/framework/lod_tensor_test.cu | 14 +- paddle/fluid/framework/mixed_vector.cc | 21 +- paddle/fluid/framework/mixed_vector.h | 200 +++++++----------- paddle/fluid/framework/mixed_vector_test.cu | 12 +- paddle/fluid/framework/tensor.h | 1 + paddle/fluid/framework/tensor_util.cc | 25 +-- paddle/fluid/imperative/all_reduce.cc | 48 +++-- paddle/fluid/imperative/gloo_context.cc | 8 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 - paddle/fluid/operators/ctc_align_op.cu | 6 +- paddle/fluid/operators/cvm_op.cu | 3 +- .../fluid/operators/detection/box_clip_op.cu | 4 +- .../operators/detection/target_assign_op.h | 12 +- .../fused/mkldnn/multi_gru_mkldnn_op.cc | 1 + paddle/fluid/operators/lookup_table_op.cu | 4 +- paddle/fluid/operators/lookup_table_v2_op.cu | 8 +- paddle/fluid/operators/math/beam_search.cu | 8 +- .../operators/math/selected_rows_functor.cc | 3 +- .../operators/math/selected_rows_functor.cu | 27 ++- paddle/fluid/operators/math/sequence2batch.cu | 3 +- .../fluid/operators/math/sequence_padding.cu | 10 +- .../fluid/operators/math/sequence_pooling.cu | 26 +-- paddle/fluid/operators/math/sequence_scale.cu | 12 +- .../fluid/operators/optimizers/adagrad_op.cu | 6 +- paddle/fluid/operators/optimizers/adam_op.cu | 5 +- paddle/fluid/operators/optimizers/adam_op.h | 5 +- paddle/fluid/operators/optimizers/adamw_op.cu | 5 +- paddle/fluid/operators/optimizers/ftrl_op.h | 4 +- paddle/fluid/operators/optimizers/lamb_op.h | 5 +- .../fluid/operators/optimizers/momentum_op.h | 5 +- .../fluid/operators/optimizers/rmsprop_op.h | 5 +- paddle/fluid/operators/optimizers/sgd_op.cu | 6 +- paddle/fluid/operators/row_conv_op.cu | 8 +- .../sequence_ops/sequence_enumerate_op.cu | 3 +- .../sequence_ops/sequence_erase_op.cu | 3 +- .../sequence_ops/sequence_expand_as_op.cu | 8 +- .../sequence_ops/sequence_expand_op.cu | 15 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_ops/sequence_softmax_op.cu | 9 +- paddle/phi/api/ext/dispatch.h | 2 +- paddle/phi/api/lib/utils/storage.cc | 12 +- paddle/phi/api/lib/utils/storage.h | 4 +- paddle/phi/backends/dynload/cudnn.cc | 2 +- paddle/phi/backends/dynload/cufft.cc | 2 +- paddle/phi/backends/dynload/dynamic_loader.cc | 15 +- paddle/phi/backends/dynload/miopen.cc | 2 +- paddle/phi/backends/dynload/tensorrt.h | 32 +-- paddle/phi/backends/gpu/cuda/cuda_info.cc | 144 ++++++------- paddle/phi/backends/gpu/gpu_launch_config.h | 36 ++-- paddle/phi/backends/gpu/rocm/rocm_info.cc | 142 ++++++------- paddle/phi/backends/xpu/enforce_xpu.h | 6 +- paddle/phi/backends/xpu/xpu_info.cc | 2 +- paddle/phi/core/CMakeLists.txt | 6 +- paddle/phi/core/ddim.h | 1 + paddle/phi/core/dense_tensor.cc | 18 +- paddle/phi/core/dense_tensor.inl | 10 +- paddle/phi/core/infermeta_utils.h | 1 + paddle/phi/core/kernel_context.cc | 4 +- paddle/phi/core/lod_utils.h | 15 +- paddle/phi/core/selected_rows.h | 14 +- paddle/phi/core/selected_rows_impl.cc | 20 +- paddle/phi/core/selected_rows_impl.h | 19 +- paddle/phi/core/sparse_coo_tensor.cc | 6 +- paddle/phi/core/sparse_csr_tensor.cc 
| 8 +- paddle/phi/core/tensor_meta.h | 6 +- paddle/phi/infermeta/binary.cc | 14 +- paddle/phi/infermeta/multiary.cc | 4 +- paddle/phi/infermeta/unary.cc | 32 +-- paddle/phi/kernels/cpu/concat_kernel.cc | 2 +- paddle/phi/kernels/cpu/elementwise.h | 24 +-- .../phi/kernels/cpu/masked_select_kernel.cc | 2 +- paddle/phi/kernels/funcs/common_shape.h | 6 +- paddle/phi/kernels/funcs/concat_funcs.h | 28 +-- paddle/phi/kernels/funcs/eigen/common.h | 10 +- paddle/phi/kernels/funcs/elementwise_base.h | 10 +- paddle/phi/kernels/funcs/math_function.cc | 28 +-- paddle/phi/kernels/funcs/math_function.cu | 8 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/math_function_impl.h | 58 +++-- paddle/phi/kernels/gpu/concat_and_split.h | 9 +- paddle/phi/kernels/gpu/concat_kernel.cu | 2 +- paddle/phi/kernels/gpu/copy_kernel.cu | 24 +-- paddle/phi/kernels/gpu/elementwise.h | 6 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 4 +- paddle/phi/kernels/impl/full_kernel_impl.h | 2 +- paddle/phi/kernels/impl/matmul_kernel_impl.h | 144 ++++++------- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 4 +- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 2 +- .../phi/kernels/sparse/sparse_utils_kernel.h | 2 +- paddle/phi/kernels/xpu/copy_kernel.cc | 2 +- paddle/phi/kernels/xpu/scale_kernel.cc | 16 +- paddle/phi/tests/core/allocator.h | 3 +- paddle/phi/tests/core/test_dense_tensor.cc | 2 +- .../phi/tests/core/test_sparse_coo_tensor.cc | 2 +- paddle/utils/string/tinyformat/tinyformat.h | 2 + paddle/utils/string/to_string.h | 21 ++ 96 files changed, 792 insertions(+), 766 deletions(-) diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index ddda7231887ed..006485a698fb3 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -31,15 +31,17 @@ TEST(LoD, data) { lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); auto& v = lod[0]; + paddle::framework::MixVector mix_vector_v(&v); paddle::platform::CUDAPlace gpu(0); #ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu), - v.size()); + hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, + mix_vector_v.CUDAMutableData(gpu), v.size()); hipDeviceSynchronize(); #else - test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); #endif + mix_vector_v.CopyToCPU(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -62,15 +64,17 @@ TEST(LoDTensor, LoDInGPU) { EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); auto lod = lod_tensor.lod(); + paddle::framework::MixVector mix_vector(&(lod[0])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0, - lod[0].CUDAMutableData(place), lod[0].size()); + mix_vector.CUDAMutableData(place), lod[0].size()); hipDeviceSynchronize(); #else - test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); #endif + mix_vector.CopyToCPU(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index b15a66c51c4b6..67b2d70f3440c 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -64,19 +64,20 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, auto stream = dev_ctx->stream(); 
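// The stream-based memory::Copy above is asynchronous with respect to the host;
// the Wait() added on the next line blocks until the host-to-device transfer has
// finished. A plausible reading of the "solve the multi-gpu hang problem" note in
// this commit: once cpu_ is only a pointer to a caller-owned std::vector, the host
// buffer must not be reused or freed while the async copy is still in flight.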
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst, platform::CPUPlace(), src, *gpu_memory_size_, stream); + dev_ctx->Wait(); #endif } -#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyToCPU() const { \ - CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_); \ - } \ - \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ - const platform::Place &place) const { \ - CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \ +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const platform::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \ } INSTANTIATE_VECTOR_FOR_TYPE(size_t) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 0fd67efc177b3..a589a5b4ea7e1 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -22,7 +22,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" @@ -30,6 +29,9 @@ limitations under the License. */ namespace paddle { namespace framework { +template +using Vector = std::vector; + inline paddle::optional OptionalCUDAPlace( const paddle::memory::allocation::AllocationPtr &gpu_) { return gpu_ == nullptr ? paddle::none @@ -39,7 +41,7 @@ inline paddle::optional OptionalCUDAPlace( // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. 
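// A minimal usage sketch of the wrapper declared below, mirroring the pattern this
// patch applies in mixed_vector_test.cu and the operator files: wrap a caller-owned
// std::vector, hand its device mirror to a kernel, then sync back. `gpu` and
// `some_kernel` are placeholders; the MixVector API itself is the one defined here.
//
//   std::vector<size_t> lod = {0, 1, 6, 8, 10, 11};
//   paddle::framework::MixVector<size_t> mix(&lod);  // wraps, does not copy the data
//   size_t* d_lod = mix.CUDAMutableData(gpu);        // lazily copies host -> device
//   some_kernel<<<1, 1>>>(d_lod, lod.size());        // kernel may read/write d_lod
//   mix.CopyToCPU();                                 // writes device changes back to `lod`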
template -class Vector { +class MixVector { public: using value_type = T; using iterator = typename std::vector::iterator; @@ -49,82 +51,68 @@ class Vector { // The actual class to implement vector logic class VectorData { public: - VectorData() : flag_(kDataInCPU) {} - VectorData(size_t count, const T &value) - : cpu_(count, value), flag_(kDataInCPU) {} - VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} template - explicit VectorData(const std::vector &dat) - : cpu_(dat), flag_(kDataInCPU) {} + explicit VectorData(std::vector *dat) : cpu_(dat), flag_(kDataInCPU) {} ~VectorData() {} - VectorData(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - } + VectorData(const VectorData &o) = delete; - VectorData &operator=(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - return *this; - } + VectorData &operator=(const VectorData &o) = delete; T &operator[](size_t i) { MutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } const T &operator[](size_t i) const { ImmutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } - size_t size() const { return cpu_.size(); } + size_t size() const { return (*cpu_).size(); } iterator begin() { MutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } iterator end() { MutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } T &front() { MutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } T &back() { MutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } const_iterator begin() const { ImmutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } const_iterator end() const { ImmutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } const T &back() const { ImmutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } - T *data() { return &(*this)[0]; } + T *data() { return cpu_->data(); } - const T *data() const { return &(*this)[0]; } + const T *data() const { return cpu_->data(); } const T &front() const { ImmutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } // assign this from iterator. @@ -132,14 +120,14 @@ class Vector { template void assign(Iter begin, Iter end) { MutableCPU(); - cpu_.assign(begin, end); + (*cpu_).assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. void push_back(T elem) { MutableCPU(); - cpu_.push_back(elem); + (*cpu_).push_back(elem); } // extend a vector by iterator. @@ -147,14 +135,14 @@ class Vector { template void Extend(It begin, It end) { MutableCPU(); - auto out_it = std::back_inserter>(this->cpu_); + auto out_it = std::back_inserter>(*(this->cpu_)); std::copy(begin, end, out_it); } // resize the vector void resize(size_t size) { MutableCPU(); - cpu_.resize(size); + (*cpu_).resize(size); } // get cuda ptr. immutable @@ -176,26 +164,16 @@ class Vector { // clear void clear() { - cpu_.clear(); + (*cpu_).clear(); flag_ = kDirty | kDataInCPU; } - size_t capacity() const { return cpu_.capacity(); } - - // reserve data - void reserve(size_t size) const { cpu_.reserve(size); } + std::vector *get_vector() { return cpu_; } - // implicit cast operator. Vector can be cast to std::vector implicitly. 
- operator std::vector() const { - ImmutableCPU(); - return cpu_; - } + size_t capacity() const { return (*cpu_).capacity(); } - bool operator==(const VectorData &other) const { - ImmutableCPU(); - other.ImmutableCPU(); - return cpu_ == other.cpu_; - } + // reserve data + void reserve(size_t size) const { (*cpu_).reserve(size); } std::mutex &Mutex() const { return mtx_; } @@ -203,6 +181,13 @@ class Vector { return OptionalCUDAPlace(gpu_); } + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + private: enum DataFlag { kDataInCPU = 0x01, @@ -213,13 +198,6 @@ class Vector { void CopyToCPU() const; - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - void ImmutableCUDA(platform::Place place) const { if (IsDirty()) { if (IsInCPU()) { @@ -269,7 +247,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } - mutable std::vector cpu_; + std::vector *cpu_; mutable paddle::memory::allocation::AllocationPtr gpu_; mutable size_t gpu_memory_size_{0}; mutable int flag_; @@ -278,89 +256,77 @@ class Vector { }; public: - // Default ctor. Create empty Vector - Vector() : m_(new VectorData()) {} - - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) - : m_(new VectorData(count, value)) {} - - // Ctor with init_list - Vector(std::initializer_list init) : m_(new VectorData(init)) {} - // implicit cast from std::vector. template - Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT + MixVector(const std::vector *dat) { // NOLINT + m_.reset(new VectorData(const_cast *>(dat))); } // Copy ctor - Vector(const Vector &other) { m_ = other.m_; } + MixVector(const MixVector &other) = delete; // Copy operator - Vector &operator=(const Vector &other) { - m_ = other.m_; - return *this; - } + MixVector &operator=(const MixVector &other) = delete; // Move ctor - Vector(Vector &&other) { m_ = std::move(other.m_); } + MixVector(MixVector &&other) = delete; // CPU data access method. Mutable. - T &operator[](size_t i) { return (*m_.MutableData())[i]; } + T &operator[](size_t i) { return (*m_)[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { return m_.Data()[i]; } + const T &operator[](size_t i) const { return (*m_)[i]; } // std::vector iterator methods. 
Based on CPU data access method - size_t size() const { return m_.Data().size(); } + size_t size() const { return m_->size(); } - iterator begin() { return m_.MutableData()->begin(); } + iterator begin() { return m_->begin(); } - iterator end() { return m_.MutableData()->end(); } + iterator end() { return m_->end(); } - T &front() { return m_.MutableData()->front(); } + T &front() { return m_->front(); } - T &back() { return m_.MutableData()->back(); } + T &back() { return m_->back(); } - const_iterator begin() const { return m_.Data().begin(); } + const_iterator begin() const { return m_->begin(); } - const_iterator end() const { return m_.Data().end(); } + const_iterator end() const { return m_->end(); } const_iterator cbegin() const { return begin(); } const_iterator cend() const { return end(); } - const T &back() const { return m_.Data().back(); } + const T &back() const { return m_->back(); } - T *data() { return m_.MutableData()->data(); } + T *data() { return m_->data(); } - const T *data() const { return m_.Data().data(); } + const T *data() const { return m_->data(); } - const T &front() const { return m_.Data().front(); } + const T &front() const { return m_->front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - m_.MutableData()->assign(begin, end); + m_->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { m_.MutableData()->push_back(elem); } + void push_back(T elem) { m_->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - m_.MutableData()->Extend(begin, end); + m_->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (m_.Data().size() != size) { - m_.MutableData()->resize(size); + if (m_->size() != size) { + m_->resize(size); } } @@ -368,15 +334,15 @@ class Vector { const T *CUDAData(platform::Place place) const { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.Data().CUDAData(place); + return m_->CUDAData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. - m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAData(place); } @@ -384,25 +350,25 @@ class Vector { T *CUDAMutableData(platform::Place place) { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.MutableData()->CUDAMutableData(place); + return m_->CUDAMutableData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. 
- m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAMutableData(place); } // clear - void clear() { m_.MutableData()->clear(); } + void clear() { m_->clear(); } - size_t capacity() const { return m_.Data().capacity(); } + size_t capacity() const { return m_->capacity(); } // reserve data - void reserve(size_t size) { m_.Data().reserve(size); } + void reserve(size_t size) { m_->reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -422,26 +388,12 @@ class Vector { } } - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { return m_.Data(); } - - bool operator==(const Vector &other) const { - if (size() != other.size()) return false; - auto it1 = cbegin(); - auto it2 = other.cbegin(); - for (; it1 < cend(); ++it1, ++it2) { - if (*it1 != *it2) { - return false; - } - } - return true; - } + void CopyToCPU() { m_->MutableCPU(); } - const void *Handle() const { return &m_.Data(); } + const void *Handle() const { return m_.get(); } private: - // Vector is an COW object. - mutable details::COWPtr m_; + mutable std::unique_ptr m_; }; }; // namespace framework diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 011e2729d4adf..4cd9aab2896b6 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/device_context.h" template -using vec = paddle::framework::Vector; +using vec = paddle::framework::MixVector; using gpuStream_t = paddle::gpuStream_t; static __global__ void multiply_10(int* ptr) { @@ -44,10 +44,11 @@ gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { } TEST(mixed_vector, GPU_VECTOR) { - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); @@ -70,10 +71,11 @@ TEST(mixed_vector, MultiGPU) { return; } - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index b9a262105e474..57eddf782f06b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/stream.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1eb5727298c39..10eefff093b0e 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1455,22 +1455,10 @@ std::ostream& print_tensor>( } std::ostream& operator<<(std::ostream& os, const LoD& lod) { - os << "{"; - for (auto& v : lod) { - os << "{"; - bool is_first = true; - for (auto& i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - + // NOTE(xiongkun): + // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution + // if we don't redefine, the operator << of pten / framework LoD is not found. 
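// Why the forwarding above is needed (sketch; assumes LoD is now an alias of
// std::vector<std::vector<size_t>> after the Vector -> std::vector change):
// argument-dependent lookup only searches the namespaces of the argument's
// underlying type -- std here -- so an operator<< defined in another namespace
// (paddle::framework or phi) is not found unless it is redeclared where it is used.
//
//   namespace ns1 { using V = std::vector<int>;
//                   std::ostream& operator<<(std::ostream& os, const V& v); }
//   namespace ns2 { void dump(std::ostream& os, const ns1::V& v) {
//     os << v;  // error: ADL looks in std only; ns1::operator<< is invisible here
//   } }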
+ paddle::string::operator<<(os, lod); return os; } @@ -1479,6 +1467,11 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { namespace phi { +std::ostream& operator<<(std::ostream& os, const LoD& lod) { + paddle::string::operator<<(os, lod); + return os; +} + std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { if (t.lod().size() > 0) { os << " - lod: " << t.lod() << "\n"; diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 24a8ffbabf526..436e22f00c303 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -90,6 +90,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); + VLOG(4) << "Is use calculate stream: " << use_calc_stream; // 1. Gather rows number from all workers. Here use ncclAllGather to do this, // but we can use other ways to implement is in the future @@ -97,7 +98,9 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, framework::Vector rows_num_vector(strategy.nranks_); rows_num_vector[strategy.local_rank_] = static_cast(src_rows.size()); // CUDAMutableData use CalStream - auto *gpu_rows_num_ptr = rows_num_vector.CUDAMutableData(place); + paddle::framework::MixVector mixv_rows_num_vector(&rows_num_vector); + auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place); + VLOG(4) << "start dev_ctx->wait"; if (!use_calc_stream) { dev_ctx->Wait(); } @@ -109,6 +112,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::GpuStreamSync(stream); } + mixv_rows_num_vector.CopyToCPU(); const auto *cpu_rows_num_ptr = rows_num_vector.data(); auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_, @@ -121,8 +125,10 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->CUDAMutableData(place); - const auto *src_rows_ptr = src_rows.CUDAData(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const auto *src_rows_ptr = mixv_src_rows.CUDAData(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); @@ -150,24 +156,28 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); - return; - } - for (int i = 0; i < strategy.nranks_; ++i) { - if (cpu_rows_num_ptr[i] > 0) { - // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - ncclInt64, i, comm->comm(), stream)); - // 3. Broadcast the tensor data of SelectedRows - auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + - row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, - nccl_dtype, i, comm->comm(), stream)); - row_offset += cpu_rows_num_ptr[i]; + } else { + for (int i = 0; i < strategy.nranks_; ++i) { + if (cpu_rows_num_ptr[i] > 0) { + // 2. 
Broadcast the rows of SelectedRows + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], + ncclInt64, i, comm->comm(), stream)); + // 3. Broadcast the tensor data of SelectedRows + auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + + row_offset * feature_size * sizeof_dtype; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_tensor_ptr, dst_tensor_ptr_i, + cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), + stream)); + row_offset += cpu_rows_num_ptr[i]; + } } } - + if (!use_calc_stream) { + platform::GpuStreamSync(stream); + } + mixv_dst_rows.CopyToCPU(); VLOG(3) << "Original SelectedRows rows: " << string::join_strings(src_rows, ','); VLOG(3) << "Result SelectedRows rows: " diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 8997966165769..dd34b8b619f80 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -143,7 +143,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); // 1. Gather rows number from all workers. Here use ncclAllGather to do this, // but we can use other ways to implement is in the future - const auto &src_rows = src.rows(); + auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); size_t local_row_num = src_rows.size(); std::vector rows_num_vector = @@ -157,8 +157,10 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, << ", height: " << src.height(); auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->MutableData(place); - const int64_t *src_rows_ptr = src_rows.Data(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.MutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const int64_t *src_rows_ptr = mixv_src_rows.Data(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 04ae3b9afe32c..0e4fb3335f3d7 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -38,8 +38,6 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { dst->emplace_back(v); } } -template void SetLoD( - paddle::lite::LoD* dst, const framework::LoD& src); template void SetLoD( framework::LoD* dst, const paddle::lite::LoD& src); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 8a44c1327b9e6..b1f2e61ef3930 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -110,10 +110,12 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { // merge elements and delete blank T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + paddle::framework::MixVector mixv_input_lod(&input_lod[level]); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, num_seq, - input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, - merge_repeated, dev_out_lod0_ptr, output_data); + mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + mixv_input_lod.CopyToCPU(); // set output lod std::vector host_out_lod0(dev_out_lod0.begin(), diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 
ad96dc24b9206..1a3bdee53e9bd 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -149,11 +149,12 @@ class CVMGradCUDAKernel : public framework::OpKernel { batch_size, lod[lod.size() - 1], platform::errors::PreconditionNotMet( "Output(X@GRAD)'s dim[0] must be equal to last element of lod")); + paddle::framework::MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( use_cvm, item_size, cvm_data, dout_data, dx_data, true, - lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); + mixv_lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); } } }; diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index bda22dd0155cc..65f2a5590716d 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -57,9 +57,11 @@ class GPUBoxClipKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); const size_t batch_size = lod.back().size() - 1; T *output_data = output->mutable_data(dev_ctx.GetPlace()); + paddle::framework::MixVector mix_vector(&abs_offset_lod[0]); GPUBoxClip<<>>( - input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + input->data(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()), bbox_width, im_info->data(), output_data); + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 01b15865e93b6..c4506f04e083e 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -108,7 +108,8 @@ class TargetAssignKernel : public framework::OpKernel { auto x_lod = x->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_x_lod(&x_lod); + size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else size_t* x_lod_data = x_lod.data(); #endif @@ -116,6 +117,9 @@ class TargetAssignKernel : public framework::OpKernel { TargetAssignFunctor functor(x_data, match_idx_data, x_lod_data, mismatch_value, n, m, p, k, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_x_lod.CopyToCPU(); +#endif auto& device_ctx = ctx.template device_context(); platform::ForRange for_range(device_ctx, n * m); @@ -130,13 +134,17 @@ class TargetAssignKernel : public framework::OpKernel { const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_neg_lod(&neg_lod); + size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else size_t* neg_lod_data = neg_lod.data(); #endif NegTargetAssignFunctor neg_trg_functor; neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k, mismatch_value, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_neg_lod.CopyToCPU(); +#endif } } }; diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 177e8f5bcb7bd..0ffc4c91b851c 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -16,6 +16,7 @@ limitations under the 
License. */ #include #include #include "dnnl.hpp" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/multi_gru_op.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index e36c8b1c1b253..29079b8b1385d 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -164,8 +164,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto gpu_place = context.GetPlace(); // TODO(yuyang18): Strange code here. - memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + paddle::framework::MixVector mixv_new_rows(&new_rows); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(context.GetPlace()), gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 42318ca6a8d3e..4539f7091b578 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -152,14 +152,16 @@ struct LookupTableV2GradCUDAFunctor { new_rows.resize(ids_num); auto gpu_place = context_.GetPlace(); + paddle::framework::MixVector mixv_new_rows(&new_rows); if (!std::is_same::value) { InputTypeConvert<<>>( - ids_data, ids_num, new_rows.MutableData(gpu_place)); + ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); } else { - memory::Copy(gpu_place, new_rows.CUDAMutableData(gpu_place), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(gpu_place), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); } + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index c954bdf81d30d..486979aa0a8b3 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -357,8 +357,9 @@ class BeamSearchFunctor { framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); selected_lod[1].resize(scores->dims()[0] + 1); - size_t* selected_offsets = - selected_lod[1].CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&selected_lod[1]); + paddle::framework::MixVector mixv_abs(&abs_lod[level]); + size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace()); if (num_seqs == 1) { const int seq_length = static_cast(abs_lod[level][1]); @@ -377,7 +378,7 @@ class BeamSearchFunctor { is_accumulated, num_used_threads)); } } else if (num_seqs <= 4) { - const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + const size_t* seq_offsets = mixv_abs.CUDAData(context.GetPlace()); // Use only 1 block const int kMaxThreadsPerSeq = 32; const int kMaxSeqs = 4; @@ -400,6 +401,7 @@ class BeamSearchFunctor { } context.Wait(); + mix_vector.CopyToCPU(); if (!framework::CheckLoD(selected_lod)) { PADDLE_THROW(platform::errors::InvalidArgument( "lod %s is not right in" diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 67165ff221989..fcd5c06a6f310 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ 
b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -170,7 +170,8 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + paddle::framework::MixVector mixv_in2_rows(&in2_rows); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index ea0b0bb29548b..8563d8b05b186 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -161,9 +161,10 @@ struct SelectedRowsAddTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); @@ -198,8 +199,9 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows + paddle::framework::MixVector mixv_in2_rows(&in2_rows); if (in1_rows.size()) { - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); } auto in1_place = input1.place(); @@ -274,9 +276,10 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, in1_row_numel); } }; @@ -356,10 +359,13 @@ struct MergeAdd { dim3 threads(block_size, 1); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } void operator()(const platform::CUDADeviceContext& context, @@ -423,10 +429,13 @@ struct MergeAdd { auto& input_rows = input->rows(); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } } }; diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index cd1ca572689bc..f56c5293971bc 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -72,8 +72,9 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); auto stream = context.stream(); + 
paddle::framework::MixVector mix_index_lod(&index_lod); CopyMatrixRowsKernel<<>>( - src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height, width, is_src_index); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 65bf77f0d152b..01fd2d403c456 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -59,7 +59,7 @@ class PaddingLoDTensorFunctor { int lod_level = 0, bool norm_by_times = false, const PadLayout layout = kBatchLengthWidth) { auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; const auto& seq_tensor_dims = seq_tensor.dims(); const auto& pad_tensor_dims = pad_tensor->dims(); int max_seq_len = MaximumSequenceLength(seq_offsets); @@ -104,10 +104,11 @@ class PaddingLoDTensorFunctor { T* pad_data = pad_tensor->data(); const T* pad_value_data = pad_value.data(); + paddle::framework::MixVector mix_vector_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( pad_data, seq_data, pad_value_data, pad_value.numel() == 1, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, - step_width, norm_by_times, layout); + mix_vector_seq_offsets.CUDAData(context.GetPlace()), seq_num, + pad_seq_len, step_width, norm_by_times, layout); } }; @@ -157,9 +158,10 @@ class UnpaddingLoDTensorFunctor { const T* pad_data = pad_tensor.data(); T* seq_data = seq_tensor->data(); + paddle::framework::MixVector mixv_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( seq_data, pad_data, nullptr, false, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + mixv_seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, step_width, norm_by_times, layout); } }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 1c09acf52fae3..fa7b043153851 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -168,41 +168,42 @@ class SequencePoolFunctor { const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_kernel< T, MaxPoolFunctor><<>>( MaxPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_kernel< T, AvgPoolFunctor><<>>( AvgPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_kernel< T, SumPoolFunctor><<>>( SumPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_kernel< T, SqrtPoolFunctor><<>>( SqrtPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), 
item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_kernel< T, LastPoolFunctor><<>>( LastPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_kernel< T, FirstPoolFunctor><<>>( FirstPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -335,41 +336,42 @@ class SequencePoolGradFunctor { const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_grad_kernel< T, MaxPoolGradFunctor><<>>( MaxPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_grad_kernel< T, AvgPoolGradFunctor><<>>( AvgPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_grad_kernel< T, SumPoolGradFunctor><<>>( SumPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_grad_kernel< T, SqrtPoolGradFunctor><<>>( SqrtPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_grad_kernel< T, LastPoolGradFunctor><<>>( LastPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_grad_kernel< T, FirstPoolGradFunctor><<>>( FirstPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 1807c77e37ca1..8e02d1b70ff83 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -41,21 +41,23 @@ class ScaleLoDTensorFunctor { auto lod = seq->lod(); const size_t num_seq = lod[level].size() - 1; const size_t seq_width = seq->numel() / seq->dims()[0]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto abs_offset_lod = framework::ToAbsOffset(lod); T* seq_data = seq->mutable_data(context.GetPlace()); + paddle::framework::MixVector mix_vector(&(abs_offset_lod[level])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( HIP_KERNEL_NAME(SequenceScaleKernel), 
dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(), - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #else SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5bfbc3fd681b8..3b8ef9056946a 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -96,12 +96,14 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); + paddle::framework::MixVector mixv_merge_rows(&merge_rows); SparseAdagradFunctorKernel< T, 256><<(context) .stream()>>>( - grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, - param_data, moment_data, grad_width, epsilon); + grad_merge_data, mixv_merge_rows.CUDAMutableData(context.GetPlace()), + lr, param_data, moment_data, grad_width, epsilon); + mixv_merge_rows.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 668dd41fa257f..c1aa392d8a528 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -345,7 +345,10 @@ class AdamOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 7a04b0bd75a49..decab04f1ca26 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -592,7 +592,10 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index abdc61e7fcb46..1d61bdec26d58 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -368,7 +368,10 @@ class AdamWOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector 
mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index b74009120abc4..596ed05df3ffd 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -189,7 +189,9 @@ class FTRLOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* merged_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_merged_rows(merged_rows); + const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace()); auto row_numel = static_cast(merged_grad->value().dims()[1]); auto row_height = static_cast(merged_grad->rows().size()); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index a2189d2a7ca0e..45acf2b3e4834 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -594,7 +594,10 @@ class LambOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (platform::is_gpu_place(ctx.GetPlace()) && beta1_pow.place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 0561c18580a3f..e271755b740ce 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -561,7 +561,10 @@ class MomentumOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 66c16d8015806..71decd27d0d78 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -227,7 +227,10 @@ class RmspropOpKernel : public framework::OpKernel { merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &grad_merge_rows = merged_grad->rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + &grad_merge_rows); + const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto &merged_tensor = merged_grad->value(); int64_t row_count = merged_grad->rows().size(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a255f0fed3ce0..3149f5f56ed49 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ 
b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -148,11 +148,11 @@ class SGDOpKernel int thread_x = kThreadsPerBlock; int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - + paddle::framework::MixVector mixv_in_rows(&in_rows); SparseSGDFunctorKernel<<>>( - in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel, in_rows.size()); + in_data, mixv_in_rows.CUDAData(ctx.GetPlace()), + learning_rate->data(), out_data, in_row_numel, in_rows.size()); } else { PADDLE_ENFORCE_EQ(false, true, diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 3def7875232e8..c5794948aaec6 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -336,7 +336,8 @@ class RowConvKernel int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&batch_indices); + size_t *idx = mix_vector.CUDAMutableData(context.GetPlace()); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -352,6 +353,7 @@ class RowConvKernel RowConvForward<<>>( in, weight, num_sequence, input_dim, future_context, idx, out); } + mix_vector.CopyToCPU(); } }; @@ -392,7 +394,8 @@ class RowConvGradKernel // int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mixv_batch_indices(&batch_indices); + size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); phi::funcs::SetConstant zero; @@ -444,6 +447,7 @@ class RowConvGradKernel dout, weights, num_sequence, input_dim, future_context, idx, din); } } + mixv_batch_indices.CopyToCPU(); } }; } // namespace operators diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 8092a40d19b19..9591f3e8b5bbf 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -71,7 +71,8 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { out->Resize({in_dims[0], win_size}); auto out_data = out->mutable_data(context.GetPlace()); // Copy LoD to GPU - const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_lod0(&lod0); + const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace()); // Calc output tensor CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bb928cf401c33..12d3eee65da70 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -88,7 +88,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { // Copy LoD to GPU auto last_lod = lod[lod.size() - 1]; auto lod_len = last_lod.size(); - const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace()); + paddle::framework::MixVector mixv_last_lod(&last_lod); + const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); 
size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index f13849fda4176..7e1a06b9eca5b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -81,8 +81,9 @@ struct SequenceExpandAsFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_kernel<<>>( - x.data(), ref_lod.CUDAData(context.GetPlace()), height, width, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, width, out->mutable_data(context.GetPlace())); } }; @@ -107,10 +108,11 @@ struct SequenceExpandAsGradFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), height, width, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, + width, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index cbf5df0017075..7b7bc5183bf1f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -157,7 +157,9 @@ struct SequenceExpandFunctor { out_offset[2 * x_lod_size + i] = ref_lod[i]; } - const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_out_offset(&out_offset); + const size_t* out_offset_data = + mixv_out_offset.CUDAData(context.GetPlace()); const size_t* x_lod_data = out_offset_data + x_lod_size; const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size; @@ -193,11 +195,14 @@ struct SequenceExpandGradFunctor { int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); + paddle::framework::MixVector mixv_x_lod(&x_lod); + paddle::framework::MixVector mixv_out_offset(&out_offset); sequence_expand_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), - x_lod.CUDAData(context.GetPlace()), - out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), + mixv_x_lod.CUDAData(context.GetPlace()), + mixv_out_offset.CUDAData(context.GetPlace()), ref_lod.size(), + x_item_length, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index c42df836de15f..90a17d713cf29 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -132,7 +132,9 @@ class SequenceReverseOpKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - lod = x.lod()[0].CUDAData(ctx.GetPlace()); + auto xlod = x.lod()[0]; + paddle::framework::MixVector mixv_xlod(&xlod); + lod = mixv_xlod.CUDAData(ctx.GetPlace()); } else { #endif lod = x.lod()[0].data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu 
index 220165ac1bd4f..c91c59dbfee99 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -133,9 +133,10 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_kernel< T, kThreadsPerBlock><<>>( - x.data(), ref_lod.CUDAData(context.GetPlace()), height, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, out->mutable_data(context.GetPlace())); } }; @@ -156,10 +157,12 @@ struct SequenceSoftmaxGradFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_grad_kernel< T, kThreadsPerBlock><<>>( - dout.data(), out.data(), ref_lod.CUDAData(context.GetPlace()), - height, dx->mutable_data(context.GetPlace())); + dout.data(), out.data(), + mixv_ref_lod.CUDAData(context.GetPlace()), height, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/phi/api/ext/dispatch.h b/paddle/phi/api/ext/dispatch.h index 4e5fa879a2cfc..6b6d0ae7fe723 100644 --- a/paddle/phi/api/ext/dispatch.h +++ b/paddle/phi/api/ext/dispatch.h @@ -292,7 +292,7 @@ namespace paddle { paddle::experimental::complex128, \ __VA_ARGS__) \ default: \ - PADDLE_THROW(paddle::platform::errors::InvalidArgument( \ + PADDLE_THROW(phi::errors::InvalidArgument( \ "Invalid enum data type `%d`.", static_cast(__dtype__))); \ } \ }() diff --git a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc index db3f5f0c8f98b..09ff18d10e312 100644 --- a/paddle/phi/api/lib/utils/storage.cc +++ b/paddle/phi/api/lib/utils/storage.cc @@ -19,7 +19,7 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, - const paddle::platform::Place& place) + const phi::Place& place) : phi::Storage(std::make_shared(ptr, size, place)), size_(size) {} @@ -29,11 +29,11 @@ ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { - PADDLE_ENFORCE_LE(static_cast(delta + size), - root->size(), - paddle::platform::errors::InvalidArgument( - "The size of the external storage does " - "not meet the metadata requirements.")); + PADDLE_ENFORCE_LE( + static_cast(delta + size), + root->size(), + phi::errors::InvalidArgument("The size of the external storage does " + "not meet the metadata requirements.")); } } // namespace experimental diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h index ede5f80483662..c2eedd0fa63f7 100644 --- a/paddle/phi/api/lib/utils/storage.h +++ b/paddle/phi/api/lib/utils/storage.h @@ -30,7 +30,7 @@ class ExternalStorage : public phi::Storage { static const char* name() { return "ExternalStorage"; } void Realloc(size_t n) override { - PADDLE_THROW(paddle::platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "The external shared storage cannot be reallocated.")); } @@ -55,7 +55,7 @@ class ExternalStorage : public phi::Storage { const phi::Place& place() const override { PADDLE_ENFORCE_NOT_NULL( data_, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "Unable to visit place as data_ has not been initialized yet.")); return data_->place(); } diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index ff000d27c4f2e..02d626d5f98f9 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ 
-54,7 +54,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cudnn_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cudnn shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 14240af41046c..596a68c1ed6aa 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -33,7 +33,7 @@ bool HasCUFFT() { void EnforceCUFFTLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cufft_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cufft shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 473c58b33eebc..2f35e22a18f82 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include #endif -// TODO(wilber): The pten computing library requires a component to manage flags +// TODO(wilber): The phi computing library requires a component to manage flags // (maybe not use gflags). #include "gflags/gflags.h" #include "glog/logging.h" @@ -299,8 +299,8 @@ static inline void* GetDsoHandleFromSearchPath( #endif // !_WIN32 if (throw_on_error) { // NOTE: Special error report case, no need to change its format - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - error_msg, dso_name, errorno)); + PADDLE_THROW( + phi::errors::PreconditionNotMet(error_msg, dso_name, errorno)); } else { LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); } @@ -547,14 +547,11 @@ void* GetOpDsoHandle(const std::string& dso_name) { void* GetNvtxDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Apple.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Apple.")); #elif defined(_WIN32) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Windows.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Windows.")); #elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Nvtx do not support without CUDA.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support without CUDA.")); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); #endif diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index a57574dbab13b..e7916873ccfde 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -58,7 +58,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( miopen_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load miopen shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/tensorrt.h b/paddle/phi/backends/dynload/tensorrt.h index 77f25ec0b5aaf..cd8c6457f1b91 100644 --- a/paddle/phi/backends/dynload/tensorrt.h +++ b/paddle/phi/backends/dynload/tensorrt.h @@ -54,21 +54,21 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> DECLARE_TYPE(__name, args...) { \ - std::call_once(tensorrt_dso_flag, []() { \ - tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ - }); \ - static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ - PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ - "Load tensorrt api %s failed", #__name)); \ - using tensorrt_func = decltype(&::__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL( \ + p_##__name, \ + phi::errors::Unavailable("Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ @@ -80,7 +80,7 @@ extern void* tensorrt_plugin_dso_handle; }); \ static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \ PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ + phi::errors::Unavailable( \ "Load tensorrt plugin %s failed", #__name)); \ using tensorrt_plugin_func = decltype(&::__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index f8e4ec02bc39e..7be21e85f0005 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -14,7 +14,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. #include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; @@ -74,13 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -93,26 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); @@ -213,7 +213,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. 
Please input " @@ -233,13 +233,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } @@ -294,13 +294,13 @@ gpuError_t GpuGetLastError() { return cudaGetLastError(); } // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements // for more detail about managed memory requirements bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( @@ -312,13 +312,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 2119375504457..5aa569e0197bd 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -100,12 +100,12 @@ struct GpuLaunchConfig { inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { - PADDLE_ENFORCE_GT(numel, - 0, - paddle::platform::errors::InvalidArgument( - "element quantity should be greater than 0," - " but received value is: %d.", - numel)); + PADDLE_ENFORCE_GT( + numel, + 0, + phi::errors::InvalidArgument("element quantity should be greater than 0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); /* If thread number per block is 64/128/256/512, cuda performs better.*/ @@ -142,18 +142,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, int x_dim, int y_dim) { - PADDLE_ENFORCE_GT(x_dim, - 0, - paddle::platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is: %d", - x_dim)); - PADDLE_ENFORCE_GT(y_dim, - 0, - paddle::platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is: %d", - y_dim)); + PADDLE_ENFORCE_GT( + x_dim, + 0, + phi::errors::InvalidArgument("x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT( + y_dim, + 0, + phi::errors::InvalidArgument("y dim number should greater than 0," + " but received value is: %d", + y_dim)); const int kThreadsPerBlock = 256; int block_cols = (std::min)(x_dim, kThreadsPerBlock); diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index c7390cfb6a219..11dd4f7248782 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -78,13 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -97,26 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = @@ -216,7 +216,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " @@ -235,13 +235,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } @@ -293,13 +293,13 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( @@ -311,13 +311,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index bcfebf6d49fb8..29b048ead852d 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -173,7 +173,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); ::phi::backends::xpu::details::ExternalApiType< \ __XPU_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -183,7 +183,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); do { \ auto __cond__ = (COND); \ if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -192,7 +192,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); #define PADDLE_ENFORCE_XDNN_NOT_NULL(ptr) \ do { \ if (UNLIKELY(ptr == nullptr)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg( \ baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE, \ "XPU memory is not enough")); \ diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 527e13238082e..96e95df7a9886 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -100,7 +100,7 @@ void SetXPUDeviceId(int id) { PADDLE_ENFORCE_LT( id, GetXPUDeviceCount(), - paddle::platform::errors::InvalidArgument("id must less than XPU count")); + phi::errors::InvalidArgument("id must less than XPU count")); PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); } diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 32b9b42f74f62..80bcc66477cb1 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -13,8 +13,8 @@ cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) cc_library(ddim SRCS ddim.cc DEPS pten_enforce) cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector) -cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector) +cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce) +cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce) cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) @@ -23,7 +23,7 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim) cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index 
1d186fe3b43fe..ce462d8d95402 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include +#include #include #include #include diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index a363d3cbaaa34..44cb63e2b874b 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -73,7 +73,7 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, size_t requested_size) { PADDLE_ENFORCE_NOT_NULL( allocator, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Required allocator shall not be nullptr, but received nullptr.")); if (this->dtype() != dtype) { VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; @@ -81,13 +81,13 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, } PADDLE_ENFORCE( valid(), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The meta data must be valid when call the mutable data function.")); size_t bytes = numel() * SizeOf(this->dtype()); if (requested_size) { PADDLE_ENFORCE_GE(requested_size, bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reserved size %d should be enough to meet the " "volume required by metadata %d.", requested_size, @@ -112,7 +112,7 @@ const T* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -123,7 +123,7 @@ T* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -133,7 +133,7 @@ void* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -143,7 +143,7 @@ const void* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -151,7 +151,7 @@ const void* DenseTensor::data() const { void DenseTensor::set_meta(DenseTensorMeta&& meta) { PADDLE_ENFORCE(!meta_.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when the original attribute of Tensor is " "incomplete, can it be reset.")); meta_ = std::move(meta); @@ -160,7 +160,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { void DenseTensor::set_meta(const DenseTensorMeta& meta) { PADDLE_ENFORCE( meta.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input meta is invalid, please check the meta attribute.")); meta_.dims = meta.dims; meta_.dtype = meta.dtype; diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 0547776acad1f..a422a95346e8b 100644 --- 
a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -54,22 +54,22 @@ DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); inline bool IsInitialized() const { return holder_ != nullptr; } template -T* mutable_data(const paddle::platform::Place& place, +T* mutable_data(const phi::Place& place, size_t requested_size = 0); template T* mutable_data(const DDim& dims, - const paddle::platform::Place& place, + const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, const phi::Stream& stream); diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 1b8cfea130d49..7cf92e4d933b3 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/phi/core/macros.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" +#include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index 3c7222f7a5379..a32e0e44f4696 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -69,7 +69,7 @@ void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { } else if (idx == input_range_.size()) { input_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, @@ -83,7 +83,7 @@ void KernelContext::AssignOutputRange(std::pair&& range, size_t idx) { } else if (idx == output_range_.size()) { output_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, diff --git a/paddle/phi/core/lod_utils.h b/paddle/phi/core/lod_utils.h index a5f73b66fb99b..147fca4cb576c 100644 --- a/paddle/phi/core/lod_utils.h +++ b/paddle/phi/core/lod_utils.h @@ -13,18 +13,11 @@ // limitations under the License. #pragma once - -// See Note [ Why still include the fluid headers? 
] -#ifndef PADDLE_WITH_CUSTOM_KERNEL -#include "paddle/fluid/framework/mixed_vector.h" -#endif +#include +#include namespace phi { -#ifndef PADDLE_WITH_CUSTOM_KERNEL -using LoD = std::vector>; -#else -using LoD = std::vector>; -#endif +using LoD = std::vector>; void AppendLoD(LoD* lod, const LoD& lod_length); @@ -40,4 +33,4 @@ void AppendLoD(LoD* lod, const LoD& lod_length); */ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); -} // namespace pten +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index cd48777b8ea61..7ee475b4d5d9e 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -55,25 +55,17 @@ class SelectedRows : public TensorBase, void set_height(int64_t height) { impl_->set_height(height); } - const paddle::framework::Vector& rows() const { - return impl_->rows(); - } + const std::vector& rows() const { return impl_->rows(); } - paddle::framework::Vector* mutable_rows() { - return impl_->mutable_rows(); - } - - void set_rows(const paddle::framework::Vector& rows) { - impl_->set_rows(rows); - } + std::vector* mutable_rows() { return impl_->mutable_rows(); } + void set_rows(const std::vector& rows) { impl_->set_rows(rows); } /* * @brief Get the index of key in rows * * @return -1 if the key does not exists. */ int64_t Index(int64_t key) const { return impl_->Index(key); } - /* * @brief whether has the specified key in the table. * diff --git a/paddle/phi/core/selected_rows_impl.cc b/paddle/phi/core/selected_rows_impl.cc index 920e9935d5899..7e5fd51343a09 100644 --- a/paddle/phi/core/selected_rows_impl.cc +++ b/paddle/phi/core/selected_rows_impl.cc @@ -28,7 +28,7 @@ struct ReAllocateVisitor { template void operator()() const { phi::DenseTensor cpu_tensor; - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; T* ptr = cpu_tensor.mutable_data(dims_, cpu); const T* old_ptr = tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); @@ -57,7 +57,7 @@ struct TensorCopyVisitor { template void apply() const { // TODO(Yancey1989): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; paddle::memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, @@ -82,7 +82,7 @@ struct TensorFillVisitor { template void apply() const { // TODO(qiao): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; auto* tensor_data = dst_->mutable_data(cpu); auto* start = tensor_data + dst_offset_; auto* end = start + size_; @@ -121,16 +121,16 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { rwlock_->UNLock(); - PADDLE_ENFORCE_EQ(auto_grown, - true, - paddle::platform::errors::NotFound( - "Input key(%lld) is not found.", key)); + PADDLE_ENFORCE_EQ( + auto_grown, + true, + phi::errors::NotFound("Input key(%lld) is not found.", key)); rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Row map size(%zu) should be equal to rows size(%zu).", map_size, vector_size)); @@ -140,7 +140,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Selected rows is full, then length exceed the length of first " "dimension (%d).", row_num)); @@ -187,7 +187,7 @@ void SelectedRowsImpl::Get(const phi::DenseTensor& ids, PADDLE_ENFORCE_EQ( value_width, value->numel() / value->dims()[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output tensor should have the same shape with table " "except the first dimmension, excepted value width not counting " "the first dimension is %d, actual value width is %d.", diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index 86579e529371a..3c54b59a159dd 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -27,8 +27,6 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/rw_lock.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/mixed_vector.h" namespace phi { class SelectedRowsImpl { /* @@ -68,13 +66,11 @@ class SelectedRowsImpl { void set_height(int64_t height) { height_ = height; } - const paddle::framework::Vector& rows() const { return rows_; } + const std::vector& rows() const { return rows_; } - paddle::framework::Vector* mutable_rows() { return &rows_; } + std::vector* mutable_rows() { return &rows_; } - void set_rows(const paddle::framework::Vector& rows) { - rows_ = rows; - } + void set_rows(const std::vector& rows) { rows_ = rows; } /* * @brief Get the index of key in rows @@ -84,7 +80,7 @@ class SelectedRowsImpl { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW(paddle::platform::errors::NotFound( + PADDLE_THROW(phi::errors::NotFound( "Input id (%lld) is not in current rows table.", key)); } return static_cast(std::distance(rows_.begin(), it)); @@ -156,10 +152,7 @@ class SelectedRowsImpl { /// \brief Returns the dims of the tensor. /// \return The dims of the tensor. 
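// Illustrative sketch, not from the patch itself: with this header change phi's
// SelectedRows/SelectedRowsImpl keep their rows in a plain std::vector<int64_t>
// rather than paddle::framework::Vector<int64_t>, which is what lets the CMake
// hunk above drop the mixed_vector dependency from phi core. Host-side callers
// are unchanged; the height() accessor below is assumed alongside the shown
// set_height().
#include "paddle/phi/core/selected_rows.h"
void InspectRows(const phi::SelectedRows& sr) {
  const std::vector<int64_t>& rows = sr.rows();  // was paddle::framework::Vector<int64_t>
  int64_t height = sr.height();
  // GPU code that still needs a device pointer now wraps the vector in
  // paddle::framework::MixVector at the call site, as in the fluid hunks above.
  (void)rows;
  (void)height;
}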
- const DDim& dims() const noexcept { - return value_->dims(); - // return phi::make_ddim(dims); - } + const DDim& dims() const noexcept { return value_->dims(); } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. @@ -185,7 +178,7 @@ class SelectedRowsImpl { // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. // SelectedRowsImpl are simply concated when adding together. Until a // SelectedRowsImpl add a Tensor, will the duplicate rows be handled. - paddle::framework::Vector rows_; + std::vector rows_; std::unordered_map id_to_index_; // should not be used when rows_ has duplicate member std::unique_ptr value_{nullptr}; diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index 1659f09248be0..f2987e36d3db0 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -69,17 +69,17 @@ void SparseCooTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE_GE(non_zero_num, this->nnz(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the non_zero_num must be greater than or equal to the " "origin non_zero_num.")); PADDLE_ENFORCE_GE(sparse_dim, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be greater than or equal 1.")); PADDLE_ENFORCE_LE( sparse_dim, dense_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be less than or equal dense_dims.")); DDim indices_dims = phi::make_ddim({sparse_dim, non_zero_num}); diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 7f7cd76378cc4..cbf5f941b665d 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -20,7 +20,7 @@ inline void check_shape(const DDim& dims) { bool valid = dims.size() == 2 || dims.size() == 3; PADDLE_ENFORCE(valid, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor only support 2-D Tensor.")); } #define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \ @@ -29,12 +29,12 @@ inline void check_shape(const DDim& dims) { PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_crows.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_crows and non_zero_cols must have the same place.")); \ PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_elements.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_cols and non_zero_elements must have the same place.")); \ } @@ -77,7 +77,7 @@ void* SparseCsrTensor::AllocateFrom(Allocator* allocator, void SparseCsrTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE(this->initialized(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor must be initialized when call Resize " "function.")); check_shape(dense_dims); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index ede9b43b1f382..3d2da542c7417 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -20,6 +20,8 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" +#include "paddle/utils/any.h" +#include "paddle/utils/optional.h" // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? @@ -31,11 +33,7 @@ limitations under the License. */ namespace phi { using DDim = phi::DDim; -#ifndef PADDLE_WITH_CUSTOM_KERNEL -using LoD = std::vector>; -#else using LoD = std::vector>; -#endif /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. /// diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index a964788b15e31..7455f1e6a0896 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -23,7 +23,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " "should be 1 or 2", x_dims.to_str())); @@ -32,7 +32,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ( true, x_rank == static_cast(y_dims.size()), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor Y: %s should match with " "input tenosr X: %s", y_dims.to_str(), @@ -47,7 +47,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ(true, shape_match, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor X: %s should " "be exactly the same " "with input tensor Y: %s", @@ -71,12 +71,12 @@ void MatmulInferMeta(const MetaTensor& x, auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(x) dims size must be greater than 0," " but reviced dims size is 0. ")); PADDLE_ENFORCE_GT(ndims_y, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(y) dims size must be greater than 0," " but reviced dims size is 0. ")); @@ -150,7 +150,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, if (x_dims.size() == y_dims.size()) { PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "axis should be -1 or 0 while the dimension of " "tensor X (%s) is equal to the dimension of " "tensor Y (%s), but received axis: %s", @@ -160,7 +160,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, } PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis range must be [%s, %s), but axis is %s. 
" "Please set the axis again.", -1 * max_dim, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 5e7dd1de69d7d..d72033f952857 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -24,7 +24,7 @@ void ConcatInferMeta(const std::vector& x, MetaConfig config) { PADDLE_ENFORCE_GE(x.size(), 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); @@ -34,7 +34,7 @@ void ConcatInferMeta(const std::vector& x, PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index fda395e6d95ec..1fbd6c2b6c2f5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -38,11 +38,11 @@ void FlattenInferMeta(const MetaTensor& x, if (stop_axis < 0) { stop_axis = stop_axis + in_dims_size; } - PADDLE_ENFORCE_GE(stop_axis, - start_axis, - paddle::platform::errors::InvalidArgument( - "The stop_axis should be greater" - "than or equal to start_axis.")); + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); int64_t outer = 1; std::vector out_shape; @@ -113,7 +113,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", phi::make_ddim(shape), @@ -123,7 +123,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " @@ -136,7 +136,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_GT( shape[i], 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", @@ -161,7 +161,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( output_shape[unk_dim_idx] * capacity, -in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' attribute in ReshapeOp is invalid. " "The input tensor X'size must be divisible by known " "capacity of 'shape'. " @@ -179,7 +179,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X'size must be equal to the capacity of " "'shape'. " @@ -199,7 +199,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LE( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X's shape = [%s], X's capacity = %d." 
"But the target shape of Out is [%s], the " @@ -364,7 +364,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( axis_value >= -rank && axis_value < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -383,7 +383,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ(input_axis_dim % num, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input's size along the split dimension " "must be evenly divisible by Attr(num_or_sections). " "But received Attr(num_or_sections) " @@ -416,7 +416,7 @@ void SplitInferMeta(const MetaTensor& x, if (config.is_runtime) { PADDLE_ENFORCE_LE(num_of_unknow, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. " "But received Attr(num_or_sections) = [%s].", @@ -430,7 +430,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_LT( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) other than unknown section " "must be less than the input's " "size " @@ -447,7 +447,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) must be equal to the input's " "size " "along the split dimension. But received Attr(num_or_sections)" diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 0cae2599f8d13..3b74951a5041c 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( x[i].lod().size(), lod_size_0, - paddle::platform::errors::Unimplemented( + phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " "Maybe different lod level of input LoDTensors can concat," "it is not supported currently. The lod level of %dth input " diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index c692038d24a0a..28bf5ab743f6d 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -127,7 +127,7 @@ struct SameDimsDivideFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If use SameDimsDivideFunctor, template args(T) must be floating " "point. 
"); } @@ -278,12 +278,10 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, std::vector index_array(max_dim, 0); const T* x_data = x.data(); const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + x_data, phi::errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( @@ -317,12 +315,12 @@ void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -385,12 +383,12 @@ void ElementwiseCompute(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -630,12 +628,12 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 274863a863b79..f377658d507f6 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -48,7 +48,7 @@ void MaskedSelectKernel(const Context& dev_ctx, DDim out_dim{out_size}; out->Resize(out_dim); - auto out_data = out->mutable_data(paddle::platform::CPUPlace()); + auto out_data = out->mutable_data(phi::CPUPlace()); int index = 0; for (int i = 0; i < mask_size; i++) { diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index e14241d03c3af..8bd9867f39edd 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -42,12 +42,12 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -72,7 +72,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. 
Operands could " "not be broadcast together with the shape of X = [%s] and " "the shape of Y = [%s]. Received [%d] in X is not equal to " diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 63f0c8058acc1..32237e2cc2366 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -23,7 +23,7 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -42,17 +42,17 @@ static inline phi::DDim ComputeAndCheckShape( auto out_dims = inputs_dims[0]; size_t in_zero_dims_size = out_dims.size(); for (size_t i = 1; i < n; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), - out_dims.size(), - paddle::platform::errors::InvalidArgument( - "The shape of input[0] and input[%d] " - "is expected to be equal." - "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - i, - inputs_dims[0], - i, - inputs_dims[i])); + PADDLE_ENFORCE_EQ( + inputs_dims[i].size(), + out_dims.size(), + phi::errors::InvalidArgument("The shape of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, + inputs_dims[0], + i, + inputs_dims[i])); for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { if (is_runtime) { @@ -71,7 +71,7 @@ static inline phi::DDim ComputeAndCheckShape( // check all shape in run time PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %d-th dimension of input[0] and input[%d] " "is expected to be equal." "But received input[0]'s shape = " @@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape( } } // namespace funcs -} // namespace pten +} // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/common.h b/paddle/phi/kernels/funcs/eigen/common.h index dc64d3b122f10..d34427df0e499 100644 --- a/paddle/phi/kernels/funcs/eigen/common.h +++ b/paddle/phi/kernels/funcs/eigen/common.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +// EigenDim converts phi::DDim into Eigen::DSizes. template struct EigenDim { using Type = Eigen::DSizes; @@ -29,7 +29,7 @@ struct EigenDim { static Type From(const DDim& dims) { PADDLE_ENFORCE_EQ(arity(dims), D, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension size should be equal to %d, but " "received dimension size is %d.", arity(dims), @@ -42,7 +42,7 @@ struct EigenDim { } }; -// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +// Interpret phi::Tensor as EigenTensor and EigenConstTensor. 
template { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, @@ -100,7 +100,7 @@ struct EigenMatrix : public EigenTensor { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 9fb2dac6c425f..9a429dfaaf957 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -343,7 +343,7 @@ inline void get_mid_dims(const DDim &x_dims, if (x_dims[i + axis] != y_dims[i]) { PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. Operands " "could not be broadcast together with the shape of " "X = [%s] and the shape of Y = [%s]. Received [%d] " @@ -754,7 +754,7 @@ void ElementwiseKernel(const KPDevice &ctx, const int kArity = Traits::arity; PADDLE_ENFORCE_EQ(ins.size(), kArity, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The number of inputs is expected to be equal to the " "arity of functor. But recieved: the number of inputs " "is %d, the arity of functor is %d.", @@ -762,7 +762,7 @@ void ElementwiseKernel(const KPDevice &ctx, kArity)); PADDLE_ENFORCE_EQ(outs->size(), NumOuts, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Number of outputs shall equal to number of functions, " "but number of outputs is %d, of functions is %d.", outs->size(), @@ -773,7 +773,7 @@ void ElementwiseKernel(const KPDevice &ctx, PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), (*outs)[0]->dims(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The shape of each output tensor shall be identical yet, " "but %dth output tensor`s shape is not.", i)); @@ -796,7 +796,7 @@ void ElementwiseKernel(const KPDevice &ctx, ctx, ins, outs, func); break; default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); break; } diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 8aed099d9f243..4201a75be8ac7 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -184,7 +184,7 @@ struct TensorSetConstantCPU { : tensor_(tensor), value_(value) {} template void apply() const { - auto cpu = paddle::platform::CPUPlace(); + auto cpu = phi::CPUPlace(); auto* begin = tensor_->mutable_data(cpu); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } @@ -197,8 +197,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW( - paddle::platform::errors::Unimplemented("XPUPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("XPUPlace is not supported")); } template <> @@ -206,8 +205,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW( - 
paddle::platform::errors::Unimplemented("NPUPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported")); } template <> @@ -215,8 +213,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "NPUPinnedPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported")); } template <> @@ -224,8 +221,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW( - paddle::platform::errors::Unimplemented("IPUPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported")); } template <> @@ -233,12 +229,11 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW( - paddle::platform::errors::Unimplemented("CustomPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported")); } template <> -void set_constant_with_place( +void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { @@ -250,8 +245,7 @@ void set_constant_with_place( const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) { - PADDLE_THROW( - paddle::platform::errors::Unimplemented("MLUPlace is not supported")); + PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported")); } template <> @@ -286,7 +280,7 @@ void set_constant(const paddle::platform::DeviceContext& context, // tensor->place().apply_visitor(func); paddle::platform::VisitPlace(tensor->place(), func); #else - func(paddle::platform::CPUPlace()); + func(phi::CPUPlace()); #endif } @@ -302,7 +296,7 @@ struct RowwiseAdd { PADDLE_ENFORCE_EQ( vector.numel(), size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input vector size" " should be equal to the size of each row of input tensor." " Expected vector size=%d, but received %d", @@ -312,7 +306,7 @@ struct RowwiseAdd { const char* out_dims_cstr = out_dims.to_str().c_str(); PADDLE_ENFORCE_EQ(out_dims, in_dims, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output tensor shape should be same as the input" " tensor shape. Expected output tensor shape: %s," " but received %s", diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index 0b2b53c28c984..ae368a005f057 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -257,7 +257,7 @@ struct RowwiseAdd { PADDLE_ENFORCE_EQ( vector.numel(), size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input vector size" " should be equal to the size of each row of input tensor." " Expected vector size=%d, but received %d", @@ -268,7 +268,7 @@ struct RowwiseAdd { PADDLE_ENFORCE_EQ( out_dims, in_dims, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The output tensor shape should be same as the input tensor" " shape. 
Expected output tensor shape: %s," " but received %s", @@ -303,7 +303,7 @@ void ColwiseSum::operator()( auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input vector" " should be equal to the size of input tensor column" " dimension. Expected vector size=%d, but received %d", @@ -339,7 +339,7 @@ void RowwiseSum::operator()( auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input vector" " should be equal to the size of input tensor row" " dimension. Expected vector size=%d, but received %d", diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index 7f581c395cc71..8e1a4cdd1a968 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -115,7 +115,7 @@ struct TensorSetConstantXPU { std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); paddle::memory::Copy(place_, begin, - paddle::platform::CPUPlace(), + phi::CPUPlace(), static_cast(data_cpu.get()), numel * sizeof(T)); } diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h index b099c6d411602..1638d03e50f95 100644 --- a/paddle/phi/kernels/funcs/math_function_impl.h +++ b/paddle/phi/kernels/funcs/math_function_impl.h @@ -74,7 +74,7 @@ void ColwiseSum::operator()( auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(out->numel(), size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", @@ -102,7 +102,7 @@ class ColwiseSum { PADDLE_ENFORCE_EQ( out->numel(), size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", @@ -130,15 +130,14 @@ void RowwiseMean::operator()( const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2U, - paddle::platform::errors::InvalidArgument("The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + phi::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. 
Expected output size=%d, but received %d", @@ -161,18 +160,18 @@ class RowwiseMean { const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), - 2U, - paddle::platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + phi::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( out->numel(), height, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", @@ -198,15 +197,14 @@ void RowwiseSum::operator()( const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2U, - paddle::platform::errors::InvalidArgument("The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + phi::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", @@ -229,18 +227,18 @@ class RowwiseSum { const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), - 2U, - paddle::platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + phi::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( out->numel(), height, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. 
Expected output size=%d, but received %d", diff --git a/paddle/phi/kernels/gpu/concat_and_split.h b/paddle/phi/kernels/gpu/concat_and_split.h index 46586012ccc1e..ced48ece979f0 100644 --- a/paddle/phi/kernels/gpu/concat_and_split.h +++ b/paddle/phi/kernels/gpu/concat_and_split.h @@ -16,7 +16,6 @@ #include #include #include "gflags/gflags.h" -#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" @@ -329,7 +328,7 @@ void ConcatImpl(const Context& context, inputs_data, in_num); paddle::memory::Copy(context.GetPlace(), tmp_dev_ins_data->ptr(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), restored, in_num * sizeof(T*), context.stream()); @@ -376,7 +375,7 @@ void ConcatImpl(const Context& context, inputs_col, inputs_col_num); paddle::memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), restored, inputs_col_num * sizeof(int64_t), context.stream()); @@ -488,7 +487,7 @@ void SplitImpl(const Context& context, outputs_data, o_num); paddle::memory::Copy(context.GetPlace(), tmp_dev_outs_data->ptr(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), restored, o_num * sizeof(T*), context.stream()); @@ -535,7 +534,7 @@ void SplitImpl(const Context& context, outputs_cols, outputs_cols_num); paddle::memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), restored, outputs_cols_num * sizeof(int64_t), context.stream()); diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index c80a873127708..b787b80c7e4ed 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( x[i].lod().size(), lod_size_0, - paddle::platform::errors::Unimplemented( + phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " "Maybe different lod level of input LoDTensors can concat," "it is not supported currently. 
The lod level of %dth input " diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index e88795b617370..0cbf5525d60f5 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -35,7 +35,7 @@ void Copy(const Context& dev_ctx, auto dst_place = dst->place(); if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "The src and dst tensor are all CPU tensor, you should call copy " "function in CPU mode.")); } @@ -74,13 +74,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "Source place and context place do not match, source " "place is %s, context place is %s.", src_gpu_place, @@ -98,13 +98,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "Destination place and context place do not match, " "destination place is %s, context place is %s.", dst_gpu_place, @@ -121,14 +121,14 @@ void Copy(const Context& dev_ctx, auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from GPU memory to CUDA Pinned memory, current " "device context place should be GPU.")); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The source GPU device and current device context do " "not match. The source GPU device number is %d, but " "device context GPU number is %d.", @@ -146,14 +146,14 @@ void Copy(const Context& dev_ctx, auto ctx_place = dev_ctx.GetPlace(); PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Device context place mismatch. When copying Tensor " "data from CUDA Pinned memory to GPU memory, current " "device context place should be GPU.")); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The target GPU device and current device context do " "not match. 
The target GPU device number is %d, but " "device context GPU number is %d.", @@ -172,7 +172,7 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto stream = @@ -195,12 +195,12 @@ void Copy(const Context& dev_ctx, paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - PADDLE_THROW(paddle::platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "Context place dose not match the source and destination place.")); } } } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Place type error. Please check the place of src and dst Tensor.")); } } diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/gpu/elementwise.h index df66a00a80725..a2992702b164a 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/gpu/elementwise.h @@ -714,7 +714,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, DX_OP dx_op, DY_OP dy_op) { const auto gplace = ctx.GetPlace(); - auto cplace = paddle::platform::CPUPlace(); + auto cplace = phi::CPUPlace(); const T *x_data = x.data(); const T *y_data = y.data(); const Tout *out_data = out.data(); @@ -1339,12 +1339,12 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 6db987e22fc6c..c5eb5220537cd 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -111,9 +111,9 @@ void HistogramKernel(const Context& dev_ctx, DenseTensor input_min_cpu, input_max_cpu; paddle::framework::TensorCopySync( - input_min_t, paddle::platform::CPUPlace(), &input_min_cpu); + input_min_t, phi::CPUPlace(), &input_min_cpu); paddle::framework::TensorCopySync( - input_max_t, paddle::platform::CPUPlace(), &input_max_cpu); + input_max_t, phi::CPUPlace(), &input_max_cpu); output_min = input_min_cpu.data()[0]; output_max = input_max_cpu.data()[0]; diff --git a/paddle/phi/kernels/impl/full_kernel_impl.h b/paddle/phi/kernels/impl/full_kernel_impl.h index 40675dd175bef..8cced49906ecc 100644 --- a/paddle/phi/kernels/impl/full_kernel_impl.h +++ b/paddle/phi/kernels/impl/full_kernel_impl.h @@ -59,7 +59,7 @@ void FullLikeKernel(const Context& dev_ctx, (common_type_value <= static_cast(std::numeric_limits::max())), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The filled value is out of range for target type, " "current kernel type is %s, the range should between %f " "and %f, but now value is %f.", diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index 119bdc2986ea5..f6136de5d8d0c 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -38,7 +38,7 @@ static void GetBroadcastFromDims(const int x_ndim, PADDLE_ENFORCE_EQ( x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || 
y_bd_dims[i] <= 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input(X) and Input(Y) has error dim." "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s]," "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1," @@ -110,7 +110,7 @@ void MatMulFunction(const Context& dev_ctx, PADDLE_ENFORCE_EQ( M, N, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "X's numbers must be equal to Y's numbers," "when X/Y's dims =1. But received X has [%d] elements," "received Y has [%d] elements", @@ -135,27 +135,27 @@ void MatMulFunction(const Context& dev_ctx, if (x_ndim == 1) { const int N = X.numel(); if (trans_y) { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], - N, - paddle::platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 1], + N, + phi::errors::InvalidArgument("Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, + N, + y_ndim - 1, + y_dims[y_ndim - 1])); } else { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], - N, - paddle::platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 2], + N, + phi::errors::InvalidArgument("Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, + N, + y_ndim - 2, + y_dims[y_ndim - 2])); } std::vector out_dims(y_ndim - 1); if (trans_y) { @@ -213,27 +213,27 @@ void MatMulFunction(const Context& dev_ctx, if (y_ndim == 1) { const int N = Y.numel(); if (trans_x) { - PADDLE_ENFORCE_EQ(x_dims[x_ndim - 2], - N, - paddle::platform::errors::InvalidArgument( - "Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 2], + N, + phi::errors::InvalidArgument("Input(X) has error dim." + "X'dims[%d] must be equal to %d" + "But received X'dims[%d] is %d", + x_ndim - 2, + N, + x_ndim - 2, + x_dims[x_ndim - 2])); } else { - PADDLE_ENFORCE_EQ(x_dims[x_ndim - 1], - N, - paddle::platform::errors::InvalidArgument( - "Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 1], + N, + phi::errors::InvalidArgument("Input(X) has error dim." + "X'dims[%d] must be equal to %d" + "But received X'dims[%d] is %d", + x_ndim - 1, + N, + x_ndim - 1, + x_dims[x_ndim - 1])); } std::vector out_dims(x_ndim - 1); if (trans_x) { @@ -292,27 +292,27 @@ void MatMulFunction(const Context& dev_ctx, const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; if (trans_y) { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], - K, - paddle::platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 1], + K, + phi::errors::InvalidArgument("Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, + K, + y_ndim - 1, + y_dims[y_ndim - 1])); } else { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], - K, - paddle::platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 2], + K, + phi::errors::InvalidArgument("Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, + K, + y_ndim - 2, + y_dims[y_ndim - 2])); } const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; const int ndim = (std::max)(x_ndim, y_ndim); @@ -493,16 +493,16 @@ void MatmulKernel(const Context& dev_ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE(phi::product(x.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE(phi::product(y.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. ")); + PADDLE_ENFORCE_NE( + phi::product(x.dims()), + 0, + phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," + " but reviced dims size is 0. ")); + PADDLE_ENFORCE_NE( + phi::product(y.dims()), + 0, + phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," + " but reviced dims size is 0. ")); MatMulFunction(dev_ctx, x, y, out, transpose_x, transpose_y); } diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 4374b5d7f1a1d..ba89135641e0e 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -41,7 +41,7 @@ inline int64_t GetNonZeroNum(const DenseTensor& dense, PADDLE_ENFORCE_GE( dims.size(), sparse_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "sparse_dim(%d) should be less than or equal to dense.dim(%d)", sparse_dim, dims.size())); @@ -161,7 +161,7 @@ void SparseCooToCsrKernel(const Context& dev_ctx, bool valid = x_dims.size() == 2 || x_dims.size() == 3; PADDLE_ENFORCE_EQ(valid, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SparseCsrTensor only support 2-D or 3-D matrix")); const int64_t non_zero_num = x.nnz(); if (non_zero_num <= 0) return; diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index b7793e4055445..1e2c70a9cf39b 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -379,7 +379,7 @@ void SparseCooToCsrKernel(const Context& dev_ctx, bool valid = x_dims.size() == 2 || x_dims.size() == 3; PADDLE_ENFORCE_EQ(valid, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SparseCsrTensor only support 2-D or 3-D matrix")); const int64_t non_zero_num = x.nnz(); if (non_zero_num <= 0) return; diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 3d7304653e77b..b5201e16f548d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -97,7 +97,7 @@ void DenseToSparseCsrKernel(const Context& dev_ctx, bool valid = x_dims.size() == 2 || x_dims.size() == 3; 
PADDLE_ENFORCE_EQ(valid, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "SparseCsrTensor only support 2-D or 3-D Tensor.")); const int64_t sparse_dim = x_dims.size() == 2 ? 2 : 3; DenseTensor indices = phi::Empty(dev_ctx); diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 3bbedbbb346e4..58efbafc88bee 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -62,7 +62,7 @@ void Copy(const Context& dev_ctx, } paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } else { - PADDLE_THROW(paddle::platform::errors::Unimplemented( + PADDLE_THROW(phi::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); } } diff --git a/paddle/phi/kernels/xpu/scale_kernel.cc b/paddle/phi/kernels/xpu/scale_kernel.cc index e103e5afdcf9b..b5a07a7a146c3 100644 --- a/paddle/phi/kernels/xpu/scale_kernel.cc +++ b/paddle/phi/kernels/xpu/scale_kernel.cc @@ -32,13 +32,13 @@ void ScaleKernel(const Context& dev_ctx, DenseTensor* out) { out->mutable_data(dev_ctx.GetPlace()); - PADDLE_ENFORCE_EQ(x.dims(), - out->dims(), - paddle::platform::errors::InvalidArgument( - "In and out should have the same dim," - " expected %s, but got %s.", - x.dims().to_str().c_str(), - out->dims().to_str().c_str())); + PADDLE_ENFORCE_EQ( + x.dims(), + out->dims(), + phi::errors::InvalidArgument("In and out should have the same dim," + " expected %s, but got %s.", + x.dims().to_str().c_str(), + out->dims().to_str().c_str())); using XPUType = typename XPUTypeTrait::Type; int r = xpu::scale(dev_ctx.x_context(), reinterpret_cast(x.data()), @@ -50,7 +50,7 @@ void ScaleKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - paddle::platform::errors::External( + phi::errors::External( "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); } diff --git a/paddle/phi/tests/core/allocator.h b/paddle/phi/tests/core/allocator.h index 66e5b4885c836..b92178eba3045 100644 --- a/paddle/phi/tests/core/allocator.h +++ b/paddle/phi/tests/core/allocator.h @@ -29,8 +29,7 @@ class FancyAllocator : public phi::Allocator { AllocationPtr Allocate(size_t bytes_size) override { void* data = ::operator new(bytes_size); - auto* allocation = - new phi::Allocation(data, bytes_size, paddle::platform::CPUPlace()); + auto* allocation = new phi::Allocation(data, bytes_size, phi::CPUPlace()); return AllocationPtr(allocation, Delete); } }; diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index 6464ff24d24aa..ddfa184df2c1e 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -85,7 +85,7 @@ TEST(dense_tensor, ctor) { r = r && (t.dims() == m.dims); r = r && (t.dtype() == m.dtype); r = r && (t.layout() == m.layout); - r = r && (t.place() == paddle::platform::CPUPlace()); + r = r && (t.place() == phi::CPUPlace()); r = r && t.initialized(); r = r && t.IsSharedWith(t); return r; diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc index e93f1f0b0ecaf..5d0e16b0528e7 100644 --- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc @@ -53,7 +53,7 @@ TEST(sparse_coo_tensor, construct) { CHECK(sparse.dims() == dense_dims); CHECK(sparse.dtype() == DataType::FLOAT32); CHECK(sparse.layout() == DataLayout::SPARSE_COO); - CHECK(sparse.place() == paddle::platform::CPUPlace()); + 
CHECK(sparse.place() == phi::CPUPlace()); } TEST(sparse_coo_tensor, other_function) { diff --git a/paddle/utils/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h index 28a444f87c48f..4e46cbc26b638 100644 --- a/paddle/utils/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -133,6 +133,8 @@ #include #include +#include "paddle/utils/string/to_string.h" + namespace paddle { namespace string { namespace tinyformat { diff --git a/paddle/utils/string/to_string.h b/paddle/utils/string/to_string.h index 7b3332861e0fa..3cec88a4571b6 100644 --- a/paddle/utils/string/to_string.h +++ b/paddle/utils/string/to_string.h @@ -56,5 +56,26 @@ inline std::string to_string(const char* v) { return std::string(v); } +inline std::ostream& operator<<(std::ostream& os, + const std::vector>& lod) { + os << "{"; + for (auto& v : lod) { + os << "{"; + bool is_first = true; + for (auto& i : v) { + if (is_first) { + os << i; + is_first = false; + } else { + os << ", " << i; + } + } + os << "}"; + } + os << "}"; + + return os; +} + } // namespace string } // namespace paddle From f33ae2060320fe68a1aa0465de503bc882febc8c Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:09:42 +0800 Subject: [PATCH 038/101] Adapt to batch_norm_grad op and add align function in roi_align op for kunlun (#39685) * Adapt to batch_norm_grad op and add align function in roi_align op for kunlun, *test=kunlun * Adapt to batch_norm, batch_norm_grad op api for kunlun, and add unit-tests of batch_norm, roi_align. *test=kunlun --- paddle/fluid/operators/batch_norm_op_xpu.cc | 140 ++++++++++-------- paddle/fluid/operators/roi_align_op_xpu.cc | 6 +- .../unittests/xpu/test_batch_norm_op_xpu.py | 11 +- .../unittests/xpu/test_roi_align_op_xpu.py | 29 ++-- 4 files changed, 109 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 505acbbdbde1b..6699df0c8dc59 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -38,23 +38,25 @@ class BatchNormXPUKernel : public framework::OpKernel { bool global_stats = test_mode || use_global_stats; const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); const auto *x_data = x->data(); @@ -75,6 +77,7 @@ class BatchNormXPUKernel : public framework::OpKernel { saved_variance->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); + bool is_nchw = data_layout_str == "NCHW"; if (!global_stats) { auto *mean_out_data = mean_out->data(); @@ -95,7 +98,7 @@ class BatchNormXPUKernel : public framework::OpKernel { int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, momentum, scale_data, bias_data, saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, true); + mean_out_data, variance_out_data, is_nchw); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The batch_norm XPU API return wrong value[%d %s]", @@ -107,7 +110,7 @@ class BatchNormXPUKernel : public framework::OpKernel { const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, - mean_data, variance_data, true); + mean_data, variance_data, is_nchw); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -168,11 +171,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - // TODO(guozbin): Transform input tensor from NHWC to NCHW - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -207,15 +210,15 @@ class BatchNormGradXPUKernel : public framework::OpKernel { } const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); @@ -250,38 +253,35 @@ class BatchNormGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const T *mean_data = nullptr; - const T *inv_var_data = nullptr; + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_inv_std = ctx.Input("SavedVariance"); + const auto *global_mean = ctx.Input("Mean"); + const auto *global_var = ctx.Input("Variance"); // TODO(guozibin): hadle the situation case of N * H * W = 1 - if (!use_global_stats) { - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - mean_data = saved_mean->data(); - inv_var_data = saved_inv_variance->data(); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_data = running_variance->data(); - float *running_inv_var_data = - RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); - float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); - int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, - epsilon_data, running_inv_var_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); - inv_var_data = running_inv_var_data; - } if (is_inplace) { + float *global_inv_std_data; + if (use_global_stats) { + global_inv_std_data = + RAII_GUARD.alloc_l3_or_gm(global_var->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = + calculate_inv_var(dev_ctx.x_context(), global_var->data(), + epsilon, C, epsilon_data, global_inv_std_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + } auto px = *x; + auto *inv_std_data = + use_global_stats ? global_inv_std_data : batch_inv_std->data(); + auto mean_data = use_global_stats ? 
global_mean->data() + : batch_mean->data(); int r2 = calculate_inv_BN_Y( dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), - scale->data(), bias->data(), mean_data, inv_var_data, N, + scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad " @@ -289,19 +289,29 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "return wrong value[%d %s]", r2, XPUAPIErrorMsg[r2])); } - if (!d_x) { - d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); - } - if (!d_scale) { - d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); - } - if (!d_bias_data) { - d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); - } - int r3 = xpu::batch_norm_grad( - dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, - mean_data, inv_var_data, d_scale_data, d_bias_data, true); + int r3; + bool is_nchw = data_layout_str == "NCHW"; + if (use_global_stats) { + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, nullptr, nullptr, d_scale_data, d_bias_data, is_nchw, + global_mean->data(), global_var->data(), epsilon); + } else { + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, batch_mean->data(), batch_inv_std->data(), + d_scale_data, d_bias_data, is_nchw); + } PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad) return " "wrong value[%d %s]", diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7764e52c2f6da..09d2d906653e8 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -32,6 +32,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto in_dims = in->dims(); int batch_size = in_dims[0]; @@ -117,7 +118,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { dev_ctx.x_context(), in->data(), out->mutable_data(ctx.GetPlace()), rois->data(), roi_id_data, batch_size, channels, height, width, out->dims()[0], pooled_height, - pooled_width, spatial_scale, sampling_ratio, true); + pooled_width, spatial_scale, sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The roi_align XPU OP return wrong value[%d %s]", r, @@ -143,6 +144,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); int rois_num = rois->dims()[0]; int channels = in->dims()[1]; @@ -197,7 +199,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { dev_ctx.x_context(), out_grad->data(), in_grad->data(), rois->data(), roi_id_data, in->dims()[0], channels, height, width, out_grad->dims()[0], pooled_height, pooled_width, spatial_scale, - sampling_ratio, true); + sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py 
b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index 9cd34c82650e9..f401a9a537487 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -296,7 +296,9 @@ def test_global_stats(self): net2.training = False y1 = net1(x) y2 = net2(x) - self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-4), True) class TestXPUBatchNormUseGlobalStatsCase1(TestXPUBatchNormOpUseGlobalStats): @@ -320,5 +322,12 @@ def init_test(self): self.trainable_statistics = True +class TestXPUBatchNormUseGlobalStatsCase4(TestXPUBatchNormOpUseGlobalStats): + ### train mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = False + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py index 2122223dbec1b..e80b1e4c50ef2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -40,7 +40,8 @@ def set_data(self): 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate } self.outputs = {'Out': self.out_data} @@ -51,6 +52,8 @@ def init_test_case(self): self.height = 8 self.width = 6 + self.xpu_version = core.get_xpu_device_version(0) + # n, c, h, w self.x_dim = (self.batch_size, self.channels, self.height, self.width) @@ -58,7 +61,10 @@ def init_test_case(self): self.pooled_height = 2 self.pooled_width = 2 self.sampling_ratio = -1 - + if self.xpu_version == core.XPUVersion.XPU1: + self.continuous_coordinate = False + else: + self.continuous_coordinate = bool(np.random.randint(2)) self.x = np.random.random(self.x_dim).astype('float32') def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, @@ -124,12 +130,16 @@ def calc_roi_align(self): roi = self.rois[i] roi_batch_id = int(roi[0]) x_i = self.x[roi_batch_id] - roi_xmin = roi[1] * self.spatial_scale - roi_ymin = roi[2] * self.spatial_scale - roi_xmax = roi[3] * self.spatial_scale - roi_ymax = roi[4] * self.spatial_scale - roi_width = max(roi_xmax - roi_xmin, 1) - roi_height = max(roi_ymax - roi_ymin, 1) + roi_offset = 0.5 if self.continuous_coordinate else 0 + roi_xmin = roi[1] * self.spatial_scale - roi_offset + roi_ymin = roi[2] * self.spatial_scale - roi_offset + roi_xmax = roi[3] * self.spatial_scale - roi_offset + roi_ymax = roi[4] * self.spatial_scale - roi_offset + roi_width = roi_xmax - roi_xmin + roi_height = roi_ymax - roi_ymin + if not self.continuous_coordinate: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) bin_size_h = float(roi_height) / float(self.pooled_height) bin_size_w = float(roi_width) / float(self.pooled_width) roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ @@ -203,7 +213,8 @@ def set_data(self): 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate } self.outputs = {'Out': self.out_data} From da43e065cf8aa27e666fc06cc8e8717684434ee1 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 22 Feb 2022 
17:20:33 +0800 Subject: [PATCH 039/101] delete gather_ut skip_case (#39657) * delete gather_ut skip_case * add trt version limit --- paddle/fluid/inference/tensorrt/op_teller.cc | 2 + .../ir/inference/test_trt_convert_gather.py | 41 ++++++++----------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 436c80d9a6bcf..7ddd4b558228b 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -560,12 +560,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } +#if !IS_TRT_VERSION_GE(7000) auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { VLOG(3) << "Gather does not support 1-dimensional input in tensorrt"; return false; } +#endif } } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index 37d23cb18d843..9bcbbf95990f2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -155,7 +155,7 @@ def generate_trt_nodes_num(dynamic_shape): if self.input_num == 3: return 0, 5 else: - if dynamic_shape and self.axis == 0: + if dynamic_shape: return 1, 3 else: return 0, 4 @@ -179,31 +179,24 @@ def generate_trt_nodes_num(dynamic_shape): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-3 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(self.dynamic_shape.min_input_shape) != 0: - inputs = program_config.inputs - if len(inputs['input_data'].shape) == 1 or len(inputs[ - 'index_data'].shape) == 1: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_SUPPORT, - "Need to repair the case: trt reshape out failed for dynamic shape mode when inputs' dims==1." - ) - - def teller2(program_config, predictor_config): - inputs = program_config.inputs - if "axis_data" in inputs.keys(): - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_SUPPORT, - "Need to repair the case: trt do not support axis tensor input.") + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 < 7000: + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0: + inputs = program_config.inputs + if len(inputs['input_data'].shape) == 1 or len(inputs[ + 'index_data'].shape) == 1: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: trt reshape out failed for dynamic shape mode when inputs' dims==1. 
under trt7.0 " + ) def test(self): self.add_skip_trt_case() From b95cd3b7ada652e5df0e4af6950de021de6a8941 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 22 Feb 2022 17:27:10 +0800 Subject: [PATCH 040/101] Add the implementation of TCP Store (#39384) * add tcp_socket and tcp_store --- paddle/fluid/distributed/CMakeLists.txt | 2 + paddle/fluid/distributed/store/CMakeLists.txt | 1 + paddle/fluid/distributed/store/store.h | 43 +++ paddle/fluid/distributed/store/tcp_store.cc | 272 ++++++++++++++++++ paddle/fluid/distributed/store/tcp_store.h | 114 ++++++++ paddle/fluid/distributed/store/tcp_utils.cc | 201 +++++++++++++ paddle/fluid/distributed/store/tcp_utils.h | 133 +++++++++ paddle/fluid/pybind/CMakeLists.txt | 3 +- paddle/fluid/pybind/communication.cc | 42 +++ paddle/fluid/pybind/communication.h | 31 ++ paddle/fluid/pybind/pybind.cc | 2 + .../fluid/tests/unittests/test_tcp_store.py | 34 +++ 12 files changed, 877 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/distributed/store/CMakeLists.txt create mode 100644 paddle/fluid/distributed/store/store.h create mode 100644 paddle/fluid/distributed/store/tcp_store.cc create mode 100644 paddle/fluid/distributed/store/tcp_store.h create mode 100644 paddle/fluid/distributed/store/tcp_utils.cc create mode 100644 paddle/fluid/distributed/store/tcp_utils.h create mode 100644 paddle/fluid/pybind/communication.cc create mode 100644 paddle/fluid/pybind/communication.h create mode 100644 python/paddle/fluid/tests/unittests/test_tcp_store.py diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5ae2e26e87c7b..1527b752c6906 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(store) + if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() diff --git a/paddle/fluid/distributed/store/CMakeLists.txt b/paddle/fluid/distributed/store/CMakeLists.txt new file mode 100644 index 0000000000000..1fde447d97dd9 --- /dev/null +++ b/paddle/fluid/distributed/store/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog) diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h new file mode 100644 index 0000000000000..2673314d222d2 --- /dev/null +++ b/paddle/fluid/distributed/store/store.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +class Store { + public: + Store() = delete; + explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} + virtual ~Store() = default; + + virtual int64_t add(const std::string& key, int64_t value) = 0; + virtual std::vector get(const std::string& key) = 0; + virtual void wait(const std::string& key) = 0; + + virtual const std::chrono::seconds& timeout() const { return _timeout; } + + private: + std::chrono::seconds _timeout; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc new file mode 100644 index 0000000000000..de85ac0d910e9 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +namespace detail { + +constexpr int INFTIME = -1; + +std::unique_ptr MasterDaemon::start(SocketType socket) { + return std::make_unique(socket); +} + +MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { + _background_thread = std::thread{&MasterDaemon::run, this}; +} + +MasterDaemon::~MasterDaemon() { + _background_thread.join(); + tcputils::close_socket(_listen_socket); + for (SocketType socket : _sockets) { + tcputils::close_socket(socket); + } +} + +void MasterDaemon::_do_add(SocketType socket) { + int64_t new_value{}; + std::string key = tcputils::receive_string(socket); + new_value = tcputils::receive_value(socket); + std::vector old_value; + auto it = _store.find(key); + if (it != _store.end()) { + old_value = it->second; + char* buffer = reinterpret_cast(it->second.data()); + size_t len = old_value.size(); + new_value += std::stoll(std::string(buffer, len)); + } + + std::string new_value_str = std::to_string(new_value); + _store[key] = + std::vector(new_value_str.begin(), new_value_str.end()); + VLOG(3) << "TCPStore: new value (" << new_value << ") for key (" << key + << ")."; + tcputils::send_value(socket, new_value); +} + +void MasterDaemon::_do_get(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + PADDLE_ENFORCE_NE( + iter, _store.end(), + platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); + std::vector value = iter->second; + VLOG(3) << "TCPStore: value (" + << std::stoll(std::string(reinterpret_cast(value.data()), + value.size())) + << ") for key (" << key << ")."; + tcputils::send_vector(socket, value); +} + +void MasterDaemon::_do_stop(SocketType socket) { + ReplyType value = ReplyType::STOP_WAIT; + _stop = true; + 
tcputils::send_value(socket, value); +} + +void MasterDaemon::_do_wait(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + auto reply = ReplyType::STOP_WAIT; + if (iter == _store.end()) { + reply = ReplyType::WAITING; + } + VLOG(3) << "TCPStore: wait reply (" << static_cast(reply) + << ") for key (" << key << ")."; + tcputils::send_value(socket, reply); +} + +void MasterDaemon::run() { + std::vector fds; +#ifdef _WIN32 + fds.push_back({_listen_socket, POLLIN}); +#else + fds.push_back({.fd = _listen_socket, .events = POLLIN, .revents = 0}); +#endif + + while (!_stop) { + for (size_t i = 0; i < fds.size(); i++) { + fds[i].revents = 0; + } + +#ifdef _WIN32 + ::WSAPoll(fds.data(), fds.size(), INFTIME); +#else + ::poll(fds.data(), fds.size(), INFTIME); +#endif + + if (fds[0].revents != 0) { + auto socket = tcputils::tcp_accept(_listen_socket); + _sockets.emplace_back(socket); +#ifdef _WIN32 + fds.push_back({socket, POLLIN}); +#else + fds.push_back({.fd = socket, .events = POLLIN, .revents = 0}); +#endif + } + + for (size_t i = 1; i < fds.size(); i++) { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + } + } + } +} + +std::unique_ptr TCPServer::create(uint16_t port) { + int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); + auto server = std::make_unique(); + server->_master_daemon = MasterDaemon::start(socket); + return server; +} + +std::unique_ptr TCPClient::connect(const std::string host, + uint16_t port) { + int socket = tcputils::tcp_connect(host, std::to_string(port), AF_INET); + return std::make_unique(socket); +} + +void TCPClient::send_command_for_key(Command type, const std::string& key) { + tcputils::send_value(_socket, type); + if (key.empty()) { + return; + } + tcputils::send_string(_socket, key); +} + +template +void TCPClient::send_value(const T& value) { + tcputils::send_bytes(_socket, &value, 1); +} + +template +T TCPClient::receive_value() { + T res; + tcputils::receive_bytes(_socket, &res, 1); + return res; +} + +template +void TCPClient::send_vector(const std::vector& value) { + tcputils::send_vector(_socket, value); +} + +template +std::vector TCPClient::receive_vector() { + return tcputils::receive_vector(_socket); +} + +} // namespace detail + +TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, + size_t num_workers, std::chrono::seconds timeout) + : Store(timeout), _is_master(is_master), _num_workers(num_workers) { + if (_is_master) { + _server = detail::TCPServer::create(port); + } + + _client = detail::TCPClient::connect(host, port); + waitWorkers(); +} + +void TCPStore::waitWorkers() { + if (_num_workers == 0) { + return; + } + add(_init_key, 1); + + if (_server) { + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != 
tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); + } + VLOG(3) << "TCPStore initialized."; +} + +int64_t TCPStore::add(const std::string& key, int64_t value) { + _client->send_command_for_key(Command::ADD, _key_prefix + key); + _client->send_value(value); + return _client->receive_value(); +} + +std::vector TCPStore::get(const std::string& key) { + wait(key); + _client->send_command_for_key(Command::GET, _key_prefix + key); + VLOG(3) << "TCPStore get."; + return _client->receive_vector(); +} + +void TCPStore::wait(const std::string& key) { + ReplyType reply; + do { + _client->send_command_for_key(Command::WAIT, _key_prefix + key); + + reply = _client->receive_value(); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } while (reply != ReplyType::STOP_WAIT); +} + +TCPStore::~TCPStore() { + _client->send_command_for_key(Command::STOP, ""); + ReplyType ret = _client->receive_value(); + PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, + platform::errors::InvalidArgument( + "The reply for TCPStore destructure must be 0.")); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h new file mode 100644 index 0000000000000..cd706dd6640ac --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
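// A minimal sketch of driving the TCPStore declared below from two processes
// on one host (arguments follow the constructor signature in this header;
// ranks, port and key names are illustrative):
//
//   // rank 0: hosts the master daemon and also acts as a client
//   TCPStore store0("127.0.0.1", 6170, /*is_master=*/true, /*num_workers=*/2);
//   // rank 1: only connects as a client
//   TCPStore store1("127.0.0.1", 6170, /*is_master=*/false, /*num_workers=*/2);
//   int64_t v = store1.add("step", 1);               // summed across callers
//   store1.wait("step");                             // poll until the key exists
//   std::vector<uint8_t> bytes = store1.get("step"); // decimal digits as bytes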
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +enum class ReplyType { WAITING, STOP_WAIT }; +enum class Command { ADD, GET, WAIT, STOP }; + +namespace detail { + +class MasterDaemon { + public: + static std::unique_ptr start(SocketType listen_socket); + MasterDaemon() = delete; + explicit MasterDaemon(SocketType listen_socket); + ~MasterDaemon(); + + private: + void run(); + void _do_add(SocketType socket); + void _do_wait(SocketType socket); + void _do_get(SocketType socket); + void _do_stop(SocketType socket); + SocketType _listen_socket; + std::vector _sockets; + std::unordered_map> _store; + std::thread _background_thread{}; + bool _stop = false; +}; + +class TCPServer { + public: + TCPServer() = default; + static std::unique_ptr create(std::uint16_t port); + + private: + std::unique_ptr _master_daemon; +}; + +class TCPClient { + public: + explicit TCPClient(SocketType socket) : _socket{socket} {} + static std::unique_ptr connect(const std::string host, + uint16_t port); + ~TCPClient() { tcputils::close_socket(_socket); } + void send_command_for_key(Command type, const std::string& key); + + template + void send_value(const T& value); + + template + void send_vector(const std::vector& value); + template + std::vector receive_vector(); + + template + T receive_value(); + + private: + SocketType _socket; +}; + +} // namespace detail + +class TCPStore : public Store { + public: + static constexpr std::uint16_t kDefaultPort = 6170; + explicit TCPStore(std::string host, uint16_t port = kDefaultPort, + bool is_master = false, size_t num_workers = 1, + std::chrono::seconds timeout = tcputils::kDefaultTimeout); + + ~TCPStore(); + + int64_t add(const std::string& key, int64_t value) override; + std::vector get(const std::string& key) override; + void wait(const std::string& key) override; + + private: + void waitWorkers(); + std::unique_ptr _server; + std::unique_ptr _client; + + const std::string _init_key = "init/"; + const std::string _key_prefix = "/"; + std::chrono::seconds _timeout; + bool _is_master; + int _num_workers; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc new file mode 100644 index 0000000000000..d0561d0b9a9c5 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
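// The request format implemented by TCPClient / MasterDaemon above is a
// Command value, then an optional key string, then a command-specific payload.
// A minimal client-side sketch of the ADD round trip, using the helpers
// declared in tcp_utils.h (sock is an already-connected SocketType):
//
//   tcputils::send_value<Command>(sock, Command::ADD);      // command header
//   tcputils::send_string(sock, "/my_key");                 // key
//   tcputils::send_value<int64_t>(sock, 3);                 // increment to add
//   int64_t sum = tcputils::receive_value<int64_t>(sock);   // updated counter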
+ +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace tcputils { + +std::error_code socket_error() { +#ifdef _WIN32 + return std::error_code{::WSAGetLastError(), std::generic_category()}; +#else + return std::error_code{errno, std::generic_category()}; +#endif +} + +void close_socket(SocketType socket) { +#ifdef _WIN32 + ::closesocket(socket); +#else + ::close(socket); +#endif +} + +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family) { + ::addrinfo hints{}, *res; + hints.ai_flags = ai_flags; + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + const char* node = host.empty() ? nullptr : host.c_str(); + + int n; + n = ::getaddrinfo(node, port.c_str(), &hints, &res); + const char* gai_err = ::gai_strerror(n); + const char* proto = + (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); + PADDLE_ENFORCE_EQ( + n, 0, platform::errors::InvalidArgument( + "%s network %s:%s cannot be obtained. Details: %s.", proto, + host, port, gai_err)); + + return res; +} + +void free_addr_info(::addrinfo* hint) { + PADDLE_ENFORCE_NOT_NULL( + hint, platform::errors::InvalidArgument( + "The parameter for free_addr_info cannot be null.")); + ::freeaddrinfo(hint); +} + +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout) { + int ai_flags = AI_NUMERICSERV | AI_V4MAPPED | AI_ALL; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + + SocketType sockfd = -1; + bool retry = true; + auto deadline = std::chrono::steady_clock::now() + timeout; + do { + for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument( + "Create socket to connect %s:%s failed. " + "Details: %s. ", + host, port, socket_error().message())); + + if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) { + retry = false; + break; + } + VLOG(0) << "Retry to connect to " << host << ":" << port + << " while the server is not yet listening."; + close_socket(sockfd); + sockfd = -1; + std::this_thread::sleep_for(kDelay); + if (timeout != kNoTimeout && + std::chrono::steady_clock::now() >= deadline) { + retry = false; + break; + } + } + + if (timeout != kNoTimeout && std::chrono::steady_clock::now() >= deadline) { + retry = false; + } + } while (retry); + + free_addr_info(res); + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Network %s:%s cannot be connected.", host, port)); + VLOG(0) << "Successfully connected to " << host << ":" << port; + + return sockfd; +} + +SocketType tcp_listen(const std::string host, const std::string port, + int family) { + int ai_flags = AI_PASSIVE | AI_NUMERICSERV; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + ::addrinfo* cur = res; + SocketType sockfd{}; + + std::string node = host.empty() ? "IP_ANY" : host; + while (cur) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (sockfd < 0) { + VLOG(0) << "Cannot create socket on " << node << ":" << port + << ". 
Details: " << socket_error().message(); + cur = cur->ai_next; + continue; + } + + int on = 1; +#ifdef _WIN32 + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, + reinterpret_cast(&on), sizeof(on)); +#else + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); +#endif + if (ret < 0) { + VLOG(0) << "Set the address reuse option failed on the server."; + } + if (::bind(sockfd, res->ai_addr, res->ai_addrlen) == 0) { + break; + } + close_socket(sockfd); + sockfd = -1; + cur = cur->ai_next; + } + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Bind network on %s:%s failedd.", node, port)); + + ::listen(sockfd, LISTENQ); + + VLOG(0) << "The server starts to listen on " << node << ":" << port; + return sockfd; +} + +SocketType tcp_accept(SocketType socket) { + ::sockaddr_storage addr_s{}; + ::socklen_t addr_len = sizeof(addr_s); + SocketType new_socket = + ::accept(socket, reinterpret_cast<::sockaddr*>(&addr_s), &addr_len); + PADDLE_ENFORCE_GT( + new_socket, 0, + platform::errors::InvalidArgument( + "The server failed to accept a new connection. Details: %s.", + socket_error().message())); +#ifndef _WIN32 + ::fcntl(new_socket, F_SETFD, FD_CLOEXEC); +#endif + auto value = 1; +#ifdef _WIN32 + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, + reinterpret_cast(&value), sizeof(value)); +#else + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, &value, sizeof(value)); +#endif + return new_socket; +} + +void send_string(SocketType socket, const std::string& s) { + std::string::size_type size = s.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, s.data(), size); +} + +std::string receive_string(SocketType socket) { + std::string::size_type size; + receive_bytes(socket, &size, 1); + std::vector v(size); + receive_bytes(socket, v.data(), size); + return std::string(v.data(), v.size()); +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h new file mode 100644 index 0000000000000..60cb3de124da3 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef _WIN32 +#include +#include +#pragma comment(lib, "Ws2_32.lib") +#else +#include +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +// Utility functions for TCP socket. 
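// Strings and vectors are framed as a raw length field followed by the bytes
// themselves (send_string/receive_string above, send_vector/receive_vector
// below), so both peers must read the length with the same integer width.
// For example, send_string(sock, "init/") puts on the wire:
//
//   [ sizeof(std::string::size_type) bytes holding 5 ][ 'i' 'n' 'i' 't' '/' ]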
+namespace paddle { +namespace distributed { + +#ifdef _WIN32 +using SocketType = SOCKET; +#else +using SocketType = int; +#endif + +namespace tcputils { + +constexpr int LISTENQ = 2048; +constexpr std::chrono::seconds kDelay = std::chrono::seconds(3); +constexpr std::chrono::seconds kNoTimeout = std::chrono::seconds::zero(); +constexpr std::chrono::seconds kDefaultTimeout = std::chrono::seconds(360); + +std::error_code socket_error(); +void close_socket(SocketType socket); +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family); +void free_addr_info(::addrinfo*); +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout = kNoTimeout); +SocketType tcp_listen(const std::string host, const std::string port, + int family); +SocketType tcp_accept(SocketType socket); + +void send_string(SocketType socket, const std::string& s); +std::string receive_string(SocketType socket); + +template +void send_bytes(SocketType socket, const T* buffer, size_t len) { + size_t to_send = len * sizeof(T); + if (to_send == 0) { + return; + } + + auto ptr = reinterpret_cast(buffer); + + while (to_send > 0) { + auto byte_sent = ::send(socket, ptr, to_send, 0); + PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument( + "TCP send error. Details: %s.", + socket_error().message())); + to_send -= byte_sent; + ptr += byte_sent; + } +} + +template +void receive_bytes(SocketType socket, T* buffer, size_t len) { + size_t to_recv = len * sizeof(T); + if (to_recv == 0) { + return; + } + auto ptr = reinterpret_cast(buffer); + + while (to_recv > 0) { + auto byte_received = ::recv(socket, ptr, to_recv, 0); + PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument( + "TCP receive error. 
Details: %s.", + socket_error().message())); + + to_recv -= byte_received; + ptr += byte_received; + } +} + +template +void send_vector(SocketType socket, const std::vector& v) { + size_t size = v.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, v.data(), size); +} + +template +std::vector receive_vector(SocketType socket) { + size_t size; + receive_bytes(socket, &size, 1); + std::vector res(size); + receive_bytes(socket, res.data(), size); + return res; +} + +template +void send_value(SocketType socket, const T& v) { + send_bytes(socket, &v, 1); +} + +template +T receive_value(SocketType socket) { + T v; + receive_bytes(socket, &v, 1); + return v; +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 3453cff30f5ad..26c35167f404a 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils tcp_store) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -73,6 +73,7 @@ set(PYBIND_SRCS compatible.cc io.cc generator_py.cc + communication.cc cuda_streams_py.cc) if(WITH_ASCEND) diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc new file mode 100644 index 0000000000000..a0d2777f825dc --- /dev/null +++ b/paddle/fluid/pybind/communication.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/pybind/communication.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using TCPStore = paddle::distributed::TCPStore; + +void BindTCPStore(py::module* m) { + py::class_(*m, "TCPStore") + .def( + py::init()) + .def("add", &TCPStore::add) + .def("get", &TCPStore::get); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/communication.h b/paddle/fluid/pybind/communication.h new file mode 100644 index 0000000000000..17045ccfe65ca --- /dev/null +++ b/paddle/fluid/pybind/communication.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "pybind11/chrono.h" +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindTCPStore(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f653070b2eff7..58205041b8041 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -91,6 +91,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/bind_cost_model.h" #include "paddle/fluid/pybind/bind_fleet_executor.h" #include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" @@ -2621,6 +2622,7 @@ All parameter, weight, gradient are variables in Paddle. BindGlobalValueGetterSetter(&m); BindProcessMeshDesc(&m); BindFleetExecutor(&m); + BindTCPStore(&m); py::class_(m, "LodRankTable") .def("items", [](framework::LoDRankTable &table) { diff --git a/python/paddle/fluid/tests/unittests/test_tcp_store.py b/python/paddle/fluid/tests/unittests/test_tcp_store.py new file mode 100644 index 0000000000000..11e1e8cd059c8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tcp_store.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
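# The pybind binding above exposes the store as paddle.fluid.core.TCPStore
# with only the constructor, add and get bound. A minimal two-process sketch
# (the test below exercises the single-process case; host, port and worker
# count here are illustrative):
#
#   # process 0: hosts the master daemon, expects 2 workers
#   store = paddle.fluid.core.TCPStore("127.0.0.1", 6170, True, 2,
#                                      datetime.timedelta(seconds=300))
#   # process 1: connects to the same endpoint
#   store = paddle.fluid.core.TCPStore("127.0.0.1", 6170, False, 2,
#                                      datetime.timedelta(seconds=300))
#   store.add("my", 3)      # returns the accumulated counter
#   store.get("my")         # bytes of the counter, e.g. [51] for "3"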
+ +from __future__ import print_function + +import unittest +import datetime +import paddle + + +class TestTCPStore(unittest.TestCase): + def test_tcp_store(self): + store = paddle.fluid.core.TCPStore("127.0.0.1", 6170, True, 1, + datetime.timedelta(0)) + store.add("my", 3) + ret1 = store.get('my') + store.add("my", 3) + ret2 = store.get('my') + self.assertEqual(ret1[0] + 3, ret2[0]) + + +if __name__ == "__main__": + unittest.main() From 74c0bc1c3056501adb9b8393e26134e518d21849 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 22 Feb 2022 10:27:30 +0100 Subject: [PATCH 041/101] added round fwd onednn kernel (#39653) --- .../fluid/operators/mkldnn/activation_mkldnn_op.cc | 10 ++++++++++ .../unittests/mkldnn/test_activation_mkldnn_op.py | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index bc13321473b88..e8c80096dd88b 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -262,6 +262,10 @@ using EluMKLDNNFunctor = MKLDNNActivationFunc; template using ExpMKLDNNFunctor = MKLDNNActivationFunc; +template +using RoundMKLDNNFunctor = + MKLDNNActivationFunc; + template using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; @@ -330,6 +334,10 @@ namespace ops = paddle::operators; ops::MKLDNNActivationGradKernel< \ ops::grad_functor>); +#define REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(act_type, functor) \ + REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>); + #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ @@ -341,6 +349,8 @@ namespace ops = paddle::operators; __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor); + REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index e2d50fc853887..4e4fe69d914fa 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -328,6 +328,18 @@ def setUp(self): self.attrs = {"use_mkldnn": True} +class TestMKLDNNRound(TestActivation): + def setUp(self): + self.op_type = "round" + + x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(np.float32) + out = np.round(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True} + + class TestMKLDNNSigmoidDim4(TestSigmoid): def setUp(self): super(TestMKLDNNSigmoidDim4, self).setUp() From 345cc8fa7115b00bc7589161346c147730bced69 Mon Sep 17 00:00:00 2001 From: From00 Date: Tue, 22 Feb 2022 19:19:52 +0800 Subject: [PATCH 042/101] Move real and imag op to phi (#39777) * Move Real OP to phi * Move Imag OP to phi * Move Real and Imag InferShape to phi * Move Real and Imag to complex_kernel * Change PT_REGISTER_XXX to PD_REGISTER_XXX --- paddle/fluid/operators/imag_op.cc | 30 +++------ paddle/fluid/operators/imag_op.cu | 28 -------- paddle/fluid/operators/imag_op.h | 67 ------------------- paddle/fluid/operators/real_op.cc 
| 29 +++----- paddle/fluid/operators/real_op.cu | 28 -------- paddle/fluid/operators/real_op.h | 67 ------------------- paddle/phi/kernels/complex_grad_kernel.h | 31 +++++++++ paddle/phi/kernels/complex_kernel.h | 10 +++ paddle/phi/kernels/cpu/complex_grad_kernel.cc | 33 +++++++++ paddle/phi/kernels/cpu/complex_kernel.cc | 14 ++++ paddle/phi/kernels/gpu/complex_grad_kernel.cu | 33 +++++++++ paddle/phi/kernels/gpu/complex_kernel.cu | 14 ++++ .../kernels/impl/complex_grad_kernel_impl.h | 50 ++++++++++++++ paddle/phi/kernels/impl/complex_kernel_impl.h | 28 ++++++++ paddle/phi/ops/compat/complex_sig.cc | 32 +++++++++ 15 files changed, 263 insertions(+), 231 deletions(-) delete mode 100644 paddle/fluid/operators/imag_op.cu delete mode 100644 paddle/fluid/operators/imag_op.h delete mode 100644 paddle/fluid/operators/real_op.cu delete mode 100644 paddle/fluid/operators/real_op.h create mode 100644 paddle/phi/kernels/complex_grad_kernel.h create mode 100644 paddle/phi/kernels/cpu/complex_grad_kernel.cc create mode 100644 paddle/phi/kernels/gpu/complex_grad_kernel.cu create mode 100644 paddle/phi/kernels/impl/complex_grad_kernel_impl.h create mode 100644 paddle/phi/ops/compat/complex_sig.cc diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 6a195bb9400e8..33b68d68992dd 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/imag_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,15 +23,6 @@ namespace operators { class ImagOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Imag"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Imag"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class ImagOpMaker : public framework::OpProtoAndCheckerMaker { @@ -88,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, ops::ImagGradOpMaker, - ops::ImagGradOpMaker); + ops::ImagGradOpMaker, + ImagInferShapeFunctor); REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); - -REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CPU_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu deleted file mode 100644 index 9cfb2ef7f2fef..0000000000000 --- a/paddle/fluid/operators/imag_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/imag_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(imag, - ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CUDA_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.h b/paddle/fluid/operators/imag_op.h deleted file mode 100644 index 33eab2abb74e1..0000000000000 --- a/paddle/fluid/operators/imag_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class ImagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class ImagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1174e72a76b1b..1f3691978b577 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/real_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class RealOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Real"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Real"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class RealOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, ops::RealGradOpMaker<::paddle::framework::OpDesc>, - ops::RealGradOpMaker<::paddle::imperative::OpBase>); + ops::RealGradOpMaker<::paddle::imperative::OpBase>, + RealInferShapeFunctor); REGISTER_OPERATOR(real_grad, ops::RealGradOp); - -REGISTER_OP_CPU_KERNEL(real, ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CPU_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu deleted file mode 100644 index 9bfb2878a6261..0000000000000 --- a/paddle/fluid/operators/real_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/real_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(real, - ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CUDA_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.h b/paddle/fluid/operators/real_op.h deleted file mode 100644 index c5a9724e8a304..0000000000000 --- a/paddle/fluid/operators/real_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class RealKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class RealGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h new file mode 100644 index 0000000000000..505d4d3744241 --- /dev/null +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RealGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +template +void ImagGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index cfe9da2388036..44bfae9820aa8 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,4 +50,14 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } +template +void RealKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +template +void ImagKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc new file mode 100644 index 0000000000000..5c1d50f5bf27d --- /dev/null +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/complex_grad_kernel.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(real_grad, + CPU, + ALL_LAYOUT, + phi::RealGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag_grad, + CPU, + ALL_LAYOUT, + phi::ImagGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index ae09f2a5effe1..801502e16737d 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -31,3 +31,17 @@ PD_REGISTER_KERNEL(conj, double, int, int64_t) {} + +PD_REGISTER_KERNEL(real, + CPU, + ALL_LAYOUT, + phi::RealKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag, + CPU, + ALL_LAYOUT, + phi::ImagKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu new file mode 100644 index 0000000000000..ad694445d1874 --- /dev/null +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/complex_grad_kernel.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(imag_grad, + GPU, + ALL_LAYOUT, + phi::ImagGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(real_grad, + GPU, + ALL_LAYOUT, + phi::RealGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 02fd408aba86f..d0b086718a444 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -32,3 +32,17 @@ PD_REGISTER_KERNEL(conj, double, int, int64_t) {} + +PD_REGISTER_KERNEL(real, + GPU, + ALL_LAYOUT, + phi::RealKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag, + GPU, + ALL_LAYOUT, + phi::ImagKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h new file mode 100644 index 0000000000000..febc464e6a1f5 --- /dev/null +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
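// The gradient implementations below follow the phi functor pattern used
// elsewhere in this patch: allocate the output, build a platform::ForRange
// over the element count, and apply an elementwise functor from
// complex_functors.h (RealToComplexFunctor / ImagToComplexFunctor), which by
// their names write the incoming real-valued gradient into the real or the
// imaginary component of the complex-typed input gradient.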
+ +#pragma once + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +template +void RealGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx) { + auto numel = dout.numel(); + auto* dout_data = dout.data>(); + auto* dx_data = + dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); + + paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); + for_range(functor); +} + +template +void ImagGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx) { + auto numel = dout.numel(); + auto* dout_data = dout.data>(); + auto* dx_data = + dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); + + paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); + for_range(functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 910a7be965e6b..2f9b1ad046653 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -33,4 +33,32 @@ void ConjKernel(const Context& dev_ctx, for_range(functor); } +template +void RealKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::funcs::Real))); + + paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::RealFunctor functor(x_data, out_data, numel); + for_range(functor); +} + +template +void ImagKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::funcs::Real))); + + paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ImagFunctor functor(x_data, out_data, numel); + for_range(functor); +} + } // namespace phi diff --git a/paddle/phi/ops/compat/complex_sig.cc b/paddle/phi/ops/compat/complex_sig.cc new file mode 100644 index 0000000000000..b9f59c97fb50f --- /dev/null +++ b/paddle/phi/ops/compat/complex_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
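// The argument mappings below connect the old fluid operators to the new phi
// kernels: real_grad and imag_grad both read Out@GRAD and write X@GRAD, which
// matches the RealGradKernel / ImagGradKernel signatures declared in
// complex_grad_kernel.h.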
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RealGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "real_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "imag_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(real_grad, phi::RealGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(imag_grad, phi::ImagGradOpArgumentMapping); From ae8c811acbc8bce44197b12f16f206747bf647ad Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 22 Feb 2022 19:24:16 +0800 Subject: [PATCH 043/101] disable some distribute test case when in CPU test env (#39801) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 +++--- .../fluid/tests/unittests/distributed_passes/CMakeLists.txt | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ca18416a7a123..15ddcf588441e 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -712,11 +712,11 @@ if(WITH_DISTRIBUTE) endif() bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() if(WITH_ASCEND OR WITH_ASCEND_CL) bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 188b51ee16174..2bea60c3ded1a 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -10,6 +10,9 @@ if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") + list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass") + list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass") + list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass") endif() foreach(TEST_OP ${TEST_OPS}) From a08ee62ac7153de51d7413eaff45822b197c12e0 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Tue, 22 Feb 2022 20:00:37 +0800 Subject: [PATCH 
044/101] Auto Parallel support conditional block (#39612) * add subblock logic for context and partitioner * partitioner support sub blocks * revise typos * fixed param init bug for while * chmod 644 * add unitest * mv forward parser * update unitest * update dist op ctx * update dist op ctx * fixed bug in dist op ctx * fixed bug for recompute subblock --- .../distributed/auto_parallel/dist_context.py | 120 +++-- .../dist_check_finite_and_unscale.py | 6 +- .../auto_parallel/operators/dist_default.py | 19 +- .../auto_parallel/operators/dist_embedding.py | 14 +- .../auto_parallel/operators/dist_matmul.py | 47 +- .../auto_parallel/operators/dist_reshape.py | 12 +- .../operators/dist_update_loss_scaling.py | 6 +- .../distributed/auto_parallel/parallelizer.py | 6 +- .../distributed/auto_parallel/partitioner.py | 80 ++-- .../test_auto_parallel_while_op.py | 440 ++++++++++++++++++ .../test_auto_parallel_cost_model.py | 1 + .../test_auto_parallel_dist_tensor.py | 10 +- .../unittests/test_auto_parallel_mapper.py | 2 +- .../test_auto_parallel_partitioner.py | 1 + .../test_auto_parallel_partitioner_gpt.py | 1 + .../unittests/test_auto_parallel_reshard.py | 2 +- .../test_auto_parallel_reshard_dpmppp.py | 2 +- .../test_auto_parallel_reshard_mppp.py | 3 +- 18 files changed, 657 insertions(+), 115 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index caf220646bb60..573f23fdca519 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -55,6 +55,7 @@ def __init__(self, self._is_initialized_for_program = False self._dist_tensors_for_program = {} self._dist_ops_for_program = {} + self._block_state = BlockState() # Graph related data members self._is_initialized_for_graph = False self._serial_graph = None @@ -102,6 +103,10 @@ def process_meshes(self): def dist_op_context(self): return self._dist_op_context + @property + def block_state(self): + return self._block_state + @property def dist_main_programs(self): return self._dist_main_programs @@ -512,66 +517,83 @@ class DistributedOperatorContext: def __init__(self): self._dst_main_program = None + self._main_block = None self._dst_startup_program = None - self._varname_mapping = None - self._rank_id = None + self._startup_block = None self._cur_src_op = None self._cur_dist_attr = None self.grad_op_id_to_op_id = {} + self._work_block = None self.already_init_sync_vars = set() + self.varname_mapping = None + self.rank_id = None def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_dst_main_program" or k == "_dst_startup_program" or k == "_cur_src_op": + if k in [ + "_dst_main_program", "_dst_startup_program", "_cur_src_op", + "_work_block", "_main_block", "_startup_block" + ]: setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) return result - def set_dst_main_program(self, prog): - self._dst_main_program = prog - - def get_dst_main_program(self): + @property + def dst_main_program(self): return self._dst_main_program - def set_dst_startup_program(self, prog): - self._dst_startup_program = prog + @dst_main_program.setter + def dst_main_program(self, prog): + self._dst_main_program = prog + self._main_block = prog.blocks[0] - def get_dst_startup_program(self): - return 
self._dst_startup_program + @property + def main_block(self): + return self._main_block - def set_varname_mapping(self, mapping): - self._varname_mapping = mapping + @property + def dst_startup_program(self): + return self._dst_startup_program - def get_varname_mapping(self): - return self._varname_mapping + @dst_startup_program.setter + def dst_startup_program(self, prog): + self._dst_startup_program = prog + self._startup_block = prog.blocks[0] - def set_rank_id(self, rank_id): - self._rank_id = rank_id + @property + def startup_block(self): + return self._startup_block - def get_rank_id(self): - return self._rank_id + @property + def work_block(self): + assert self._work_block is not None + return self._work_block - def set_cur_src_op(self, cur_src_op): - self._cur_src_op = cur_src_op + @work_block.setter + def work_block(self, block): + assert block is not None + self._work_block = block - def get_cur_src_op(self): + @property + def cur_src_op(self): + assert self._cur_src_op is not None return self._cur_src_op def prepare_context(self, src_op): - self.set_cur_src_op(src_op) + self._cur_src_op = src_op # build input varname mapping kinputs = {} for input_name in src_op.desc.input_names(): varnames = [] for varname in src_op.desc.input(input_name): - assert varname in self._varname_mapping - varnames.append(self._varname_mapping[varname]) + assert varname in self.varname_mapping + varnames.append(self.varname_mapping[varname]) kinputs[input_name] = varnames # build output varname mapping @@ -579,8 +601,52 @@ def prepare_context(self, src_op): for output_name in src_op.desc.output_names(): varnames = [] for varname in src_op.desc.output(output_name): - assert varname in self._varname_mapping - varnames.append(self._varname_mapping[varname]) + assert varname in self.varname_mapping + varnames.append(self.varname_mapping[varname]) koutputs[output_name] = varnames return kinputs, koutputs + + +class BlockState(object): + def __init__(self): + self.nblock = 0 + self.forward_indices = [] + self.backward_indices = [] + self.backward_to_forward_index_map = {} + + def parse_forward_blocks(self, program): + + while program.current_block_idx != 0: + program._rollback() + + assert program.current_block_idx == 0 + + for idx, block in enumerate(program.blocks): + + assert idx == block.idx, "index doesn't match" + assert block.forward_block_idx == -1, "forward_block_idx of forward block [{}] is not [{}]".format( + idx, block.forward_block_idx) + self.forward_indices.append(idx) + self.nblock += 1 + + assert self.nblock >= 1 + + def parse_backward_blocks(self, program): + + assert 0 in self.forward_indices, "forward block idx are{}".format( + self.forward_indices) + self.backward_to_forward_index_map[0] = 0 + + for idx, block in enumerate(program.blocks): + + if idx < len(self.forward_indices): + continue + + assert idx == block.idx, "index doesn't match" + assert block.forward_block_idx in self.forward_indices + self.backward_indices.append(idx) + self.backward_to_forward_index_map[idx] = block.forward_block_idx + self.nblock += 1 + + assert self.nblock == len(program.blocks) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py index 2870acfd367ca..b887de577b0a2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -76,9 +76,9 @@ def 
backward(ctx, *args, **kwargs): # by now the backward function only insert the gradient allreduce for dist op itself dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - backward_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.main_block + backward_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id dist_attr = ctx.get_op_dist_attr_for_program(backward_op) assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(backward_op)) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 48f9b5a78dd8a..4e977007261a7 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -32,6 +32,8 @@ from ..process_group import new_process_group from ..utils import _get_comm_group, _get_corresponding_rank +__op_not_need_param_init__ = ["while", "cond"] + class DistributedDefault(DistributedOperatorImplContainer): def __init__(self, op_type): @@ -195,10 +197,10 @@ def update_dims_mapping(self, dist_op): def forward(ctx, *args, **kwargs): dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id # check validation of inputs / outputs for input_name in src_op.desc.input_names(): @@ -227,6 +229,9 @@ def forward(ctx, *args, **kwargs): main_block._sync_with_cpp() # param initialization sync + if src_op.type in __op_not_need_param_init__: + return + for varname in dist_op_desc.input_arg_names(): if startup_block.has_var(varname) and startup_block.var( varname @@ -278,12 +283,12 @@ def backward(ctx, *args, **kwargs): # by now the backward function only insert the gradient allreduce for dist op itself dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - backward_op = dist_op_context.get_cur_src_op() + main_block = dist_op_context.work_block + backward_op = dist_op_context.cur_src_op dist_attr = ctx.get_op_dist_attr_for_program(backward_op) assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(backward_op)) - rank_id = dist_op_context.get_rank_id() + rank_id = dist_op_context.rank_id # check validation of inputs / outputs for input_name in backward_op.desc.input_names(): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index eac4776f8f3bc..94eb0d2d469f0 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -128,10 +128,10 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = 
dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -311,9 +311,9 @@ def backward(ctx, *args, **kwargs): # by now the backward function only insert the gradient allreduce for dist op itself dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - backward_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + backward_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id dist_attr = ctx.get_op_dist_attr_for_program(backward_op) assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(backward_op)) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index cb59a6f25c487..9eb24a65e608c 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -223,9 +223,9 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): # by now the backward function only insert the gradient allreduce for dist op itself dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - backward_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + backward_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id dist_attr = ctx.get_op_dist_attr_for_program(backward_op) assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(backward_op)) @@ -257,7 +257,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): kwargs['Y@GRAD']) X_var = main_block.var(kwargs['X'][0]) - Y_var = main_block.var(kwargs['Y'][0]) + Y_var = main_block._var_recursive(kwargs['Y'][0]) Out_grad = main_block.var(kwargs['Out@GRAD'][0]) Y_grad = main_block.var(kwargs['Y@GRAD'][0]) @@ -433,7 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): - assert Weight_var.name not in dist_op_context.already_init_sync_vars + assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format( + Weight_var.name, dist_op_context.already_init_sync_vars) assert startup_block.has_var(Weight_var.name) dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) @@ -528,10 +529,10 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -753,10 +754,10 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = 
dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -1042,10 +1043,10 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -1071,7 +1072,7 @@ def forward(ctx, *args, **kwargs): output_name) X_var = main_block.var(kwargs['X'][0]) - Weight_var = main_block.var(kwargs['Y'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) Out_var = main_block.var(kwargs['Out'][0]) # TODO infer logic comm presentation @@ -1261,10 +1262,10 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - startup_block = dist_op_context.get_dst_startup_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -1290,7 +1291,7 @@ def forward(ctx, *args, **kwargs): output_name) X_var = main_block.var(kwargs['X'][0]) - Weight_var = main_block.var(kwargs['Y'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) Out_var = main_block.var(kwargs['Out'][0]) # TODO infer logic comm presentation diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index 93b0d91b7836d..a72e304bb5b91 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -130,9 +130,9 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.work_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) @@ -287,9 +287,9 @@ def forward(ctx, *args, **kwargs): """ dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - src_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = 
dist_op_context.work_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(src_op)) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py index f216fce16f30d..4ea2e0a884716 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -65,9 +65,9 @@ def backward(ctx, *args, **kwargs): # the backward function only filte the gradient with current rank id dist_op_context = ctx.dist_op_context - main_block = dist_op_context.get_dst_main_program().global_block() - backward_op = dist_op_context.get_cur_src_op() - rank_id = dist_op_context.get_rank_id() + main_block = dist_op_context.main_block + backward_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id dist_attr = ctx.get_op_dist_attr_for_program(backward_op) assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( str(backward_op)) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 6278f0a2424a0..0f35ccd915f2a 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -132,7 +132,7 @@ def _generate_backward(self, main_program, startup_program, loss, distop_context=self._dist_context.dist_op_context) self._completer = Completer(self._dist_context) self._completer.complete_backward_annotation(main_program) - + self._dist_context.block_state.parse_backward_blocks(main_program) return params_grads def _apply_optimize(self, main_program, startup_program, params_grads): @@ -174,6 +174,7 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): serial_main_program = self._main_program.clone() serial_startup_program = self._startup_program.clone() serial_loss = serial_main_program.global_block().var(self._loss.name) + # generating serial if dist_context is None: # Annotation completion @@ -186,6 +187,9 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): completed_main_program = serial_main_program self._dist_context = copy.deepcopy(dist_context) + # parse forward sub block + self._dist_context.block_state.parse_forward_blocks(serial_main_program) + # serial backward pass params_grads = self._generate_backward( completed_main_program, serial_startup_program, serial_loss, diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index e789d82632e07..2f88407c093a5 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -29,6 +29,9 @@ from .operators.common import BACKWARD_ONLY_DIST_OPS __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES +] class Partitioner(object): @@ -75,8 +78,8 @@ def partition(self, serial_main_program, serial_startup_program, # init distop helper dist_op_context = self._dist_context.dist_op_context - dist_op_context.set_varname_mapping(self._serial2dist_varname_mapping) - dist_op_context.set_rank_id(self._rank_id) + dist_op_context.varname_mapping = 
self._serial2dist_varname_mapping + dist_op_context.rank_id = self._rank_id # partition startup program if serial_startup_program == None: @@ -84,7 +87,7 @@ def partition(self, serial_main_program, serial_startup_program, else: partitioned_startup_prog = self.partition_startup_program( serial_main_program, serial_startup_program) - dist_op_context.set_dst_startup_program(partitioned_startup_prog) + dist_op_context.dst_startup_program = partitioned_startup_prog # partition main program partitioned_main_prog, partitioned_params_grads = self.partition_main_program( @@ -157,15 +160,45 @@ def partition_main_program(self, serial_main_program, params_and_grads): 2. replace local op with corresponding dist op """ - dist_op_context = self._dist_context.dist_op_context partitioned_main_prog = fluid.Program() - dist_op_context.set_dst_main_program(partitioned_main_prog) - target_block = partitioned_main_prog.global_block() - ref_block = serial_main_program.global_block() - serial_ops = serial_main_program.global_block().ops + dist_op_context = self._dist_context.dist_op_context + dist_op_context.dst_main_program = partitioned_main_prog + + for idx in range(self._dist_context.block_state.nblock): + ref_block = serial_main_program.blocks[idx] + + if idx == 0: + target_block = partitioned_main_prog.blocks[0] + else: + target_block = partitioned_main_prog._create_block( + parent_idx=ref_block.parent_idx) + assert ref_block.idx == target_block.idx + target_block._set_forward_block_idx(ref_block.forward_block_idx) + dist_op_context.work_block = target_block + self.partition_block(ref_block, target_block) + + partitioned_main_prog.current_block_idx = 0 + + partitioned_params_and_grads = [] + for p, g in params_and_grads: + assert p.name in self._serial2dist_varname_mapping + dist_p = self._get_dist_var_by_serial_var(p, partitioned_main_prog) + if g is None: + dist_g = None + else: + assert g.name in self._serial2dist_varname_mapping + dist_g = self._get_dist_var_by_serial_var(g, + partitioned_main_prog) + partitioned_params_and_grads.append((dist_p, dist_g)) + + return partitioned_main_prog, partitioned_params_and_grads + + def partition_block(self, ref_block, target_block): + + dist_op_context = self._dist_context.dist_op_context + serial_ops = ref_block.ops # init mapping - first_backward_op_idx = -1 forward_op_id2forward_op = {} for idx in range(len(serial_ops)): if is_forward_op(serial_ops[idx]): @@ -218,23 +251,6 @@ def partition_main_program(self, serial_main_program, params_and_grads): "partitioner only support forward op and backward op, but got {}". 
format(str(op))) - partitioned_params_and_grads = [] - for p, g in params_and_grads: - assert p.name in self._serial2dist_varname_mapping - dist_p_name = self._serial2dist_varname_mapping[p.name] - assert target_block.has_var(dist_p_name) - dist_p = target_block.var(dist_p_name) - if g is None: - dist_g = None - else: - assert g.name in self._serial2dist_varname_mapping - dist_g_name = self._serial2dist_varname_mapping[g.name] - assert target_block.has_var(dist_g_name) - dist_g = target_block.var(dist_g_name) - partitioned_params_and_grads.append((dist_p, dist_g)) - - return partitioned_main_prog, partitioned_params_and_grads - def _is_valid_annotated_program(self, program): # TODO (ZJ-LIANG) should check all block @@ -245,7 +261,7 @@ def _is_valid_annotated_program(self, program): ] var_dist_attrs = [ self._dist_context.get_tensor_dist_attr_for_program(var) - for var in vars_ + for var in vars_ if (var.type not in __not_shape_var_type__) ] all_ops_annotated = all(dist_attr is not None @@ -255,6 +271,14 @@ def _is_valid_annotated_program(self, program): return all_ops_annotated and all_vars_annotated + def _get_dist_var_by_serial_var(self, serial_var, partitioned_main_prog): + + block_idx = serial_var.block.idx + target_block = partitioned_main_prog.blocks[block_idx] + dist_var_name = self._serial2dist_varname_mapping[serial_var.name] + assert target_block.has_var(dist_var_name) + return target_block.var(dist_var_name) + def _get_dist_shape(var, dist_attr): @@ -341,7 +365,7 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, """ src_var = src_block.var(src_varname) - if src_var.type == core.VarDesc.VarType.READER: + if src_var.type in __not_shape_var_type__: dst_block.create_var( type=src_var.type, name=dst_varname, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py new file mode 100644 index 0000000000000..1cd8f8f3e7083 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py @@ -0,0 +1,440 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.fluid as fluid +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto + +from paddle.distributed import fleet + +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import make_data_unshard +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context +from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = auto.ProcessMesh([0, 1]) + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + + auto.shard_tensor( + self.norm.weight, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + auto.shard_tensor( + self.norm.bias, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, 0] + }) + auto.shard_tensor( + self.linear0.bias, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [0]}) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [0, -1] + }) + auto.shard_tensor( + self.linear1.bias, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + + out = self.norm(input) + auto.shard_tensor( + out, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + out = self.linear0(out) + auto.shard_tensor( + out, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, 0] + }) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + out, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, 0] + }) + out = self.linear1(out) + auto.shard_tensor( + out, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + return out + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = 
static.Program() + start_program = static.Program() + with fluid.program_guard(train_program, start_program): + + # loop counter + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + auto.shard_tensor( + i, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + + # number of loop iterations + loop_len = fluid.layers.fill_constant( + shape=[1], dtype='int64', value=epoch_num) + auto.shard_tensor( + loop_len, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + input_array = fluid.layers.array_write(pred, i) + auto.shard_tensor( + input_array, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + cond = fluid.layers.less_than(x=i, y=loop_len) + auto.shard_tensor( + cond, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + + while_op = fluid.layers.While(cond=cond) + with while_op.block(): + + pre_input = fluid.layers.array_read(array=input_array, i=i) + auto.shard_tensor( + pre_input, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + mlp_while = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + cur_pred = mlp_while(pre_input) + + # update the loop condition + i = fluid.layers.increment(x=i, value=1, in_place=True) + fluid.layers.array_write(cur_pred, array=input_array, i=i) + fluid.layers.less_than(x=i, y=loop_len, cond=cond) + + end_pred = fluid.layers.array_read(array=input_array, i=i) + auto.shard_tensor( + end_pred, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(end_pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + auto.shard_tensor( + error_cost, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, -1, -1] + }) + + loss = paddle.mean(error_cost) + auto.shard_tensor( + loss, + dist_attr={"process_mesh": _g_process_mesh, + "dims_mapping": [-1]}) + + return train_program, start_program, dataloader, i, loss + + +def completion(train_program, start_program, dist_context): + blocks = train_program.blocks + # completion tensors + for block in blocks: + for op in block.ops: + if op.type == "layer_norm": + for out_name in op.output_arg_names: + out_var = block.vars[out_name] + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + out_var) + if tensor_dist_attr: + continue + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.process_mesh = _g_process_mesh + tensor_dist_attr.dims_mapping = 
[-1] + dist_context.set_tensor_dist_attr_for_program( + out_var, tensor_dist_attr) + + elif op.type == "elementwise_sub": + for out_name in op.output_arg_names: + out_var = block.vars[out_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.process_mesh = _g_process_mesh + tensor_dist_attr.dims_mapping = [-1, -1, -1] + dist_context.set_tensor_dist_attr_for_program( + out_var, tensor_dist_attr) + + elif op.type == "matmul_v2": + col = False + for in_name in op.input_arg_names: + if ".w_" not in in_name: + continue + if in_name not in block.vars: + in_var = blocks[0].vars[in_name] + else: + in_var = block.vars[in_name] + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + in_var) + assert tensor_dist_attr is not None + if tensor_dist_attr.dims_mapping == [-1, 0]: + col = True + for out_name in op.output_arg_names: + out_var = block.vars[out_name] + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + out_var) + if tensor_dist_attr: + continue + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.process_mesh = _g_process_mesh + if col: + tensor_dist_attr.dims_mapping = [-1, -1, 0] + else: + tensor_dist_attr.dims_mapping = [-1, -1, -1] + dist_context.set_tensor_dist_attr_for_program( + out_var, tensor_dist_attr) + elif op.type == "while": + out_name = op.desc.output("StepScopes")[0] + out_var = block.vars[out_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.process_mesh = _g_process_mesh + tensor_dist_attr.dims_mapping = [-1] + dist_context.set_tensor_dist_attr_for_program(out_var, + tensor_dist_attr) + + # completion ops + for block in blocks: + for op in block.ops: + op_dist_attr = OperatorDistributedAttribute() + op_dist_attr.process_mesh = _g_process_mesh + if op.type == "create_by_read" or op.type == "create_double_buffer_reader": + for in_name in op.input_arg_names: + op_dist_attr.set_input_dims_mapping(in_name, []) + for out_name in op.output_arg_names: + op_dist_attr.set_output_dims_mapping(out_name, []) + elif op.type == "read": + for in_name in op.input_arg_names: + op_dist_attr.set_output_dims_mapping(in_name, []) + for out_name in op.output_arg_names: + out_var = block.vars[out_name] + out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + out_var) + op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) + elif op.type == "while": + for in_name in op.input_arg_names: + in_var = block.vars[in_name] + in_dist_attr = dist_context.get_tensor_dist_attr_for_program( + in_var) + op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) + for out_name in op.output_arg_names: + if out_name == op.desc.output("StepScopes")[0]: + op_dist_attr.set_output_dims_mapping(out_name, []) + else: + out_var = block.vars[out_name] + out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + out_var) + op_dist_attr.set_output_dist_attr(out_name, + out_dist_attr) + else: + for in_name in op.input_arg_names: + if in_name == "lod_tensor_blocking_queue_0": + continue + if in_name not in block.vars: + in_var = blocks[0].vars[in_name] + else: + in_var = block.vars[in_name] + in_dist_attr = dist_context.get_tensor_dist_attr_for_program( + in_var) + op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) + for out_name in op.output_arg_names: + if out_name not in block.vars: + out_var = blocks[0].vars[out_name] + else: + out_var = block.vars[out_name] + out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + out_var) + op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) + + if op.type 
== "matmul_v2": + op_dist_attr.impl_type = "matmul_v2" + for in_name in op_dist_attr.inputs_dist_attrs.keys(): + in_dist_attr = op_dist_attr.inputs_dist_attrs[in_name] + if ".w_" in in_name and in_dist_attr.dims_mapping[-1] == 0: + op_dist_attr.impl_idx = 0 + else: + op_dist_attr.impl_idx = 1 + else: + op_dist_attr.impl_type = "default" + op_dist_attr.impl_idx = 0 + + dist_context.set_op_dist_attr_for_program(op, op_dist_attr) + make_data_unshard(train_program, start_program, dist_context) + + return train_program, start_program + + +def partition(train_program, start_program, dist_context): + + # optimizer = paddle.optimizer.SGD(learning_rate=0.00001) + rank = paddle.distributed.get_rank() + partitioner = Partitioner(dist_context, rank) + dist_main_prog, dist_startup_prog, _ = partitioner.partition( + train_program, start_program, []) + + return dist_main_prog, dist_startup_prog + + +class TestMLP(unittest.TestCase): + def test_partitioner(self): + + train_program, start_program, dataloader, i, loss = get_program() + dist_context = get_default_distributed_context() + train_program, start_program = completion(train_program, start_program, + dist_context) + dist_context.block_state.parse_forward_blocks(train_program) + + dist_main_prog, dist_startup_prog = partition( + train_program, start_program, dist_context) + global_block_ops = dist_main_prog.blocks[0].ops + global_block_ops = [op.type for op in global_block_ops] + sub_block_ops = dist_main_prog.blocks[1].ops + sub_block_ops = [op.type for op in sub_block_ops] + + self.assertTrue("c_allreduce_sum" in global_block_ops) + self.assertTrue("c_allreduce_sum" in sub_block_ops) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index 52397f51321f5..96ab0aecb7585 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -158,6 +158,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) params_grads = parallelizer._generate_backward( complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py index 27de9f325063b..29575dc76c2a1 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -47,9 +47,7 @@ def get_dist_prog(train_program, complete_train_program = completer.complete_forward_annotation( train_program ) if complete_train_program is None else complete_train_program - - # parallelizer._apply_serial_forward_pass(complete_train_program, - # startup_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) params_grads = parallelizer._generate_backward( complete_train_program, @@ -95,9 +93,9 @@ def test_new_local_tensor(self): rank_id = 1 train_program = paddle.static.Program() startup_program = paddle.static.Program() - dist_main_prog, dist_startup_prog, _ = get_dist_prog( - train_program, startup_program, dist_context, rank_id, - complete_train_program) + dist_context = DistributedContext() + dist_main_prog, dist_startup_prog, 
complete_train_program = get_dist_prog( + train_program, startup_program, dist_context, rank_id, None) dist_context.dist_main_programs[rank_id] = dist_main_prog dist_context.dist_startup_programs[rank_id] = dist_startup_prog name = "layer_norm_1.tmp_2" diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 8869fd6a59e37..36a34815b681a 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -486,7 +486,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - + dist_context.block_state.parse_forward_blocks(complete_train_program) params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index deff2144411fc..ef8780a020f33 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -53,6 +53,7 @@ def get_programs(annotated_func): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) rank_id = 3 dist_strategy = fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 01e62d886e2b7..d0bed73f1b8c4 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -885,6 +885,7 @@ def test_gpt_dp_mp(self): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) # serial backward pass params_grads = parallelizer._generate_backward( diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 1d8938785924c..1278ed68d959e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -160,7 +160,7 @@ def get_dist_prog(train_program, completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - + dist_context.block_state.parse_forward_blocks(complete_train_program) if change_process_mesh: global PP_MESH_1 dist_context.get_tensor_dist_attr_for_program( diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 5a79d1f9514ab..e84cb68f437ca 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -120,7 +120,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - + 
dist_context.block_state.parse_forward_blocks(complete_train_program) params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index 6696a9d3006d2..0636c083e54e0 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -136,7 +136,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - + dist_context.block_state.parse_forward_blocks(complete_train_program) params_grads = parallelizer._generate_backward( complete_train_program, startup_program, @@ -269,6 +269,7 @@ def test_allgather(self): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition( complete_train_program, startup_program, []) reshard(partitioned_main_prog, partitioned_startup_prog, rank_id, From 42eb56e248543830a08a115d775f135aae8be954 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Tue, 22 Feb 2022 20:38:30 +0800 Subject: [PATCH 045/101] unset fluid in tensor (#35082) --- python/paddle/framework/__init__.py | 7 +- python/paddle/tensor/attribute.py | 7 +- python/paddle/tensor/creation.py | 32 ++++--- python/paddle/tensor/einsum.py | 9 +- python/paddle/tensor/linalg.py | 69 ++++++++------- python/paddle/tensor/logic.py | 26 +++--- python/paddle/tensor/manipulation.py | 48 +++++----- python/paddle/tensor/math.py | 128 ++++++++++++++------------- python/paddle/tensor/random.py | 27 +++--- python/paddle/tensor/search.py | 73 +++++++-------- python/paddle/tensor/stat.py | 13 ++- python/paddle/tensor/to_string.py | 2 +- 12 files changed, 223 insertions(+), 218 deletions(-) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 7da9c0accfb49..b13aefb58c09e 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -32,7 +32,7 @@ from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import VarBase # noqa: F401 -from paddle.fluid import core # noqa: F401 +from ..fluid import core # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 from ..fluid.dygraph.base import grad # noqa: F401 from .io import save # noqa: F401 @@ -47,5 +47,10 @@ from ..fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 from ..fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401 from ..fluid.framework import in_dygraph_mode as in_dynamic_mode # noqa: F401 +from ..fluid.framework import _current_expected_place, _get_paddle_place # noqa: F401 +from ..fluid.framework import dygraph_only # noqa: F401 +from ..fluid.framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder # noqa: F401 +from ..fluid.framework import _in_eager_mode # noqa: F401 +from ..fluid.framework import _dygraph_tracer # noqa: F401 __all__ = [] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index ee84b43e13fef..b851f6db4acab 100644 --- a/python/paddle/tensor/attribute.py +++ 
b/python/paddle/tensor/attribute.py @@ -14,7 +14,7 @@ from __future__ import print_function -from ..fluid.framework import core, in_dygraph_mode, Variable +from ..framework import core from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype @@ -23,6 +23,7 @@ from ..fluid.layers import shape # noqa: F401 import paddle from paddle import _C_ops +from paddle.static import Variable __all__ = [] @@ -184,7 +185,7 @@ def real(x, name=None): # [[1., 2., 3.], # [4., 5., 6.]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.real(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'real') @@ -228,7 +229,7 @@ def imag(x, name=None): # [[6., 5., 4.], # [3., 2., 1.]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.imag(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'imag') diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 934ccfa72640f..ae563e641e3c8 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -18,21 +18,19 @@ from ..fluid.layers import utils from ..fluid.layers import tensor -from ..fluid.framework import Variable -from ..fluid.framework import unique_name -from ..fluid.framework import _current_expected_place, _get_paddle_place -from ..fluid.framework import dygraph_only -from ..fluid.initializer import Constant -from ..fluid.layers import core +from ..static import Variable, device_guard +from ..framework import _current_expected_place, _get_paddle_place +from ..framework import dygraph_only +from ..framework import core from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder +from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype # TODO: define functions to get create a tensor from ..fluid.layers import linspace # noqa: F401 import paddle from paddle import _C_ops -from ..fluid.framework import _in_eager_mode +from ..framework import _in_eager_mode __all__ = [] @@ -214,7 +212,7 @@ def full_like(x, fill_value, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.fill_any_like(x, 'value', fill_value, 'dtype', dtype) helper = LayerHelper("full_like", **locals()) @@ -648,7 +646,7 @@ def tril(x, diagonal=0, name=None): # [ 9, 10, 0, 0]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): op = getattr(_C_ops, 'tril_triu') return op(x, 'diagonal', diagonal, "lower", True) @@ -715,7 +713,7 @@ def triu(x, diagonal=0, name=None): # [ 0, 10, 11, 12]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): op = getattr(_C_ops, 'tril_triu') return op(x, 'diagonal', diagonal, "lower", False) @@ -757,7 +755,7 @@ def meshgrid(*args, **kwargs): if len(args) == 1 and isinstance(args[0], (list, tuple)): args = args[0] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): num = len(args) out = _C_ops.meshgrid(list(args), num) return out @@ -862,7 +860,7 @@ def diagflat(x, offset=0, name=None): # [0 0 0 4 0]] """ padding_value = 0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if len(x.shape) == 1: return _C_ops.diag_v2(x, "offset", offset, 
"padding_value", padding_value) @@ -976,7 +974,7 @@ def diag(x, offset=0, padding_value=0, name=None): print(y.numpy()) # [4] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.diag_v2(x, "offset", offset, "padding_value", padding_value) @@ -1057,7 +1055,7 @@ def empty(shape, dtype=None, name=None): dtype = convert_dtype(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) out = _C_ops.empty('shape', shape, 'dtype', convert_np_dtype_to_dtype_(dtype)) @@ -1125,7 +1123,7 @@ def empty_like(x, dtype=None, name=None): dtype = x.dtype dtype = convert_dtype(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = _C_ops.empty('shape', x.shape, 'dtype', convert_np_dtype_to_dtype_(dtype)) out.stop_gradient = True @@ -1309,7 +1307,7 @@ def complex(real, imag, name=None): # [[0.+0.j 0.+1.j 0.+2.j] # [1.+0.j 1.+1.j 1.+2.j]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return paddle._C_ops.complex(real, imag) check_variable_and_dtype(real, 'real', ['float32', 'float64'], 'complex') diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index e5d947294d922..040480c26faa8 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -15,9 +15,8 @@ import itertools import re -from ..fluid.layers import reshape, transpose -from .linalg import matmul -from .manipulation import squeeze, unsqueeze +from .linalg import matmul, transpose +from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum @@ -792,10 +791,10 @@ def einsum(equation, *operands): - For any free label which is not present for the output, it's lowered to a dummy label. - Examples - - '...ij, ...jk',where i and k are free labels, j is dummy. The output label + - '...ij, ...jk', where i and k are free labels, j is dummy. The output label string is '...ik' - 'ij -> i', where i is a free label and j is a dummy label. - - '...ij, ...jk -> ...ijk',where i, j and k are all free labels. + - '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ij', an invalid equation since `...` is not present for the output. diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 170889588aadb..91d688b761a11 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -14,8 +14,9 @@ import numpy as np from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import in_dygraph_mode, _varbase_creator, Variable, _dygraph_tracer +from ..framework import _varbase_creator, _dygraph_tracer from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..static import Variable from ..fluid.layers import transpose, cast # noqa: F401 from ..fluid import layers @@ -133,7 +134,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ op_type = 'matmul_v2' - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): op = getattr(_C_ops, op_type) return op(x, y, 'trans_x', transpose_x, 'trans_y', transpose_y) @@ -245,7 +246,7 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None): raise ValueError( "The dim of frobenius norm op should be None or two elements list!" ) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if dim is None: return _C_ops.frobenius_norm(input, 'keep_dim', keepdim, 'reduce_all', True) @@ -282,7 +283,7 @@ def vector_norm(input, axis (int, optional): None for last dimension. 
keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if axis is None: axis = -1 return _C_ops.p_norm(input, 'porder', porder, 'axis', axis, 'keepdim', keepdim, 'asvector', asvector) @@ -642,7 +643,7 @@ def mat_norm(input, porder=1., axis=None): axis = axis if axis != None and axis != [] else [0] keepdim = False - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): abs_out = _C_ops.abs(input) sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) @@ -699,7 +700,7 @@ def fro_norm(input, porder=2, axis=[-1]): reduce_all = True if axis is None or axis == [] else False keepdim = False - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): pow_out = _C_ops.pow(input, 'factor', porder) sum_out_1 = _C_ops.reduce_sum(pow_out, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) @@ -753,7 +754,7 @@ def svd_norm(input, porder, axis=[-1]): u, s, vh = svd(input, full_matrices=False) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if porder == "nuc": return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) @@ -820,7 +821,7 @@ def svd_norm(input, porder, axis=[-1]): return out def empty_tensor(input, shape): - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return input.reshape(shape) raise ValueError("only support x is nonempty tensor in static mode") @@ -895,7 +896,7 @@ def dot(x, y, name=None): """ op_type = 'dot' # skip var type check in dygraph mode to improve efficiency - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): op = getattr(_C_ops, op_type) return op(x, y) @@ -1079,7 +1080,7 @@ def t(input, name=None): "Input(input) only support N-D (N<=2) tensor, but received " "length of Input(input) is %s. Perhaps you can use paddle." "tensor.transpose() instead." % len(input.shape)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if len(input.shape) == 1: return input # 2-D tensor @@ -1144,7 +1145,7 @@ def cross(x, y, axis=None, name=None): # [0. 0. 0.] # [0. 0. 0.]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if axis is not None: return _C_ops.cross(x, y, 'dim', axis) else: @@ -1203,7 +1204,7 @@ def cholesky(x, upper=False, name=None): # [1.25450498 0.05600871 0.06400121]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.cholesky(x, "upper", upper) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cholesky') check_type(upper, 'upper', bool, 'cholesky') @@ -1257,7 +1258,7 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if tol is None: tol_tensor = None tol_attr = 0.0 @@ -1355,7 +1356,7 @@ def bmm(x, y, name=None): "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}". 
format(x_shape, y_shape)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.bmm(x, y) helper = LayerHelper('bmm', **locals()) @@ -1388,7 +1389,7 @@ def histogram(input, bins=100, min=0, max=0, name=None): result = paddle.histogram(inputs, bins=4, min=0, max=3) print(result) # [0, 2, 1, 0] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.histogram(input, "bins", bins, "min", min, "max", max) helper = LayerHelper('histogram', **locals()) @@ -1435,7 +1436,7 @@ def bincount(x, weights=None, minlength=0, name=None): if x.dtype not in [paddle.int32, paddle.int64]: raise TypeError("Elements in Input(x) should all be integers") - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.bincount(x, weights, "minlength", minlength) helper = LayerHelper('bincount', **locals()) @@ -1488,7 +1489,7 @@ def mv(x, vec, name=None): vec = paddle.to_tensor(vec_data).astype("float64") out = paddle.mv(x, vec) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = _C_ops.mv(x, vec) return out @@ -1541,7 +1542,7 @@ def det(x, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.determinant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') @@ -1596,7 +1597,7 @@ def slogdet(x, name=None): # [-0.98610914, -0.43010661, -0.10872950]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.slogdeterminant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet') @@ -1669,7 +1670,7 @@ def svd(x, full_matrices=False, name=None): # V * VH == I """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.svd(x, 'full_matrices', full_matrices) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'svd') check_type(full_matrices, 'full_matrices', bool, 'svd') @@ -1744,7 +1745,7 @@ def matrix_power(x, n, name=None): # [-7.66666667 , 8. , -1.83333333 ], # [ 1.80555556 , -1.91666667 , 0.44444444 ]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.matrix_power(x, "n", n) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'matrix_power') @@ -1801,7 +1802,7 @@ def qr(x, mode="reduced", name=None): # one can verify : X = Q * R ; """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): q, r = _C_ops.qr(x, 'mode', mode) if mode == "r": return r @@ -1900,7 +1901,7 @@ def lu(x, pivot=True, get_infos=False, name=None): # one can verify : X = P @ L @ U ; """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): LU, Piv, Info = _C_ops.lu(x, 'pivots', pivot) if get_infos: return LU, Piv, Info @@ -1997,7 +1998,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): # one can verify : X = P @ L @ U ; """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): P, L, U = _C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata, 'unpack_pivots', unpack_pivots) return P, L, U @@ -2070,7 +2071,7 @@ def eig(x, name=None): # [ (16.50471283351188+0j) , (-5.5034820550763515+0j) , # (-0.21026087843552282+0j)]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): w, v = _C_ops.eig(x) return w, v @@ -2139,7 +2140,7 @@ def eigvals(x, name=None): "The last two dimensions of Input(x) should be equal, but received x's shape = {}". 
format(x_shape)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.eigvals(x) helper = LayerHelper('eigvals', **locals()) @@ -2210,7 +2211,7 @@ def multi_dot(x, name=None): # [10, 7] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.multi_dot(x) check_type(x, 'x', (list, tuple), 'multi_dot') @@ -2262,7 +2263,7 @@ def eigh(x, UPLO='L', name=None): #[ 0.3826834323650898j , -0.9238795325112867j ]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.eigh(x, 'UPLO', UPLO) def __check_input(x, UPLO): @@ -2361,7 +2362,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): # or out * x * out = x ; """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if not hermitian: # combine svd and matmul op u, s, vt = _C_ops.svd(x, 'full_matrices', False) @@ -2611,7 +2612,7 @@ def solve(x, y, name=None): print(out) # [2., 3.]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.solve(x, y) inputs = {"X": [x], "Y": [y]} @@ -2675,7 +2676,7 @@ def triangular_solve(x, print(out) # [7, -2, -5] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', transpose, 'unitriangular', unitriangular) @@ -2732,7 +2733,7 @@ def cholesky_solve(x, y, upper=False, name=None): print(out) # [-2.5, -7, 9.5] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.cholesky_solve(x, y, 'upper', upper) helper = LayerHelper("cholesky_solve", **locals()) @@ -2776,7 +2777,7 @@ def eigvalsh(x, UPLO='L', name=None): print(out_value) #[0.17157288, 5.82842712] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): is_test = x.stop_gradient values, _ = _C_ops.eigvalsh(x, 'UPLO', UPLO, 'is_test', is_test) return values @@ -2904,7 +2905,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None): elif x.dtype == paddle.float64: rcond = 1e-15 * max(x.shape[-2], x.shape[-1]) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): solution, rank, singular_values = _C_ops.lstsq(x, y, "rcond", rcond, "driver", driver) if x.shape[-2] > x.shape[-1]: diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index a9ec489118249..858f9139231e7 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -15,8 +15,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.layers.layer_function_generator import templatedoc -from .. 
import fluid -from ..fluid.framework import in_dygraph_mode, Variable +from ..static import Variable from ..framework import VarBase as Tensor # TODO: define logic functions of a tensor @@ -25,8 +24,7 @@ from ..fluid.layers import logical_not # noqa: F401 from ..fluid.layers import logical_or # noqa: F401 from ..fluid.layers import logical_xor # noqa: F401 - -from paddle.common_ops_import import core +import paddle from paddle import _C_ops from paddle.tensor.creation import full @@ -61,7 +59,7 @@ def equal_all(x, y, name=None): result2 = paddle.equal_all(x, z) print(result2) # result2 = [False ] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.equal_all(x, y) helper = LayerHelper("equal_all", **locals()) @@ -124,7 +122,7 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # [True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan) @@ -182,7 +180,7 @@ def equal(x, y, name=None): if not isinstance(y, Variable): y = full(shape=[1], dtype=x.dtype, fill_value=y) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.equal(x, y) check_variable_and_dtype( @@ -224,7 +222,7 @@ def greater_equal(x, y, name=None): result1 = paddle.greater_equal(x, y) print(result1) # result1 = [True False True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.greater_equal(x, y) check_variable_and_dtype(x, "x", @@ -270,7 +268,7 @@ def greater_than(x, y, name=None): result1 = paddle.greater_than(x, y) print(result1) # result1 = [False False True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.greater_than(x, y) check_variable_and_dtype(x, "x", @@ -317,7 +315,7 @@ def less_equal(x, y, name=None): result1 = paddle.less_equal(x, y) print(result1) # result1 = [True True False] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.less_equal(x, y) check_variable_and_dtype( @@ -360,7 +358,7 @@ def less_than(x, y, name=None): result1 = paddle.less_than(x, y) print(result1) # result1 = [False True False] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.less_than(x, y) check_variable_and_dtype( @@ -403,7 +401,7 @@ def not_equal(x, y, name=None): result1 = paddle.not_equal(x, y) print(result1) # result1 = [False True True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.not_equal(x, y) check_variable_and_dtype( @@ -449,7 +447,7 @@ def is_tensor(x): def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): op = getattr(_C_ops, op_name) if binary_op: return op(x, y) @@ -637,7 +635,7 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # [True, True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4df026cfa4892..53bb9a8807562 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -15,11 +15,11 @@ from __future__ import print_function from collections import Counter -from ..fluid.layers import core +from ..static import Variable, device_guard +from ..framework import core from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_, device_guard, 
dygraph_only +from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils import numpy as np # TODO: define functions to manipulate a tensor @@ -378,7 +378,7 @@ def broadcast_tensors(input, name=None): """ num_inputs = len(input) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.broadcast_tensors(input, num_inputs) check_type(input, 'input', (list, tuple), 'broadcast_tensors') @@ -475,7 +475,7 @@ def flip(x, axis, name=None): """ if isinstance(axis, int): axis = [axis] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.flip(x, "axis", axis) helper = LayerHelper("flip", **locals()) @@ -671,7 +671,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if not (isinstance(x, Variable)): raise ValueError("The input x should be a Tensor") - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8'], @@ -693,7 +693,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if start_axis > stop_axis: raise ValueError("The stop_axis should be larger than stat_axis") - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): dy_out, _ = _C_ops.flatten_contiguous_range(x, 'start_axis', start_axis, 'stop_axis', stop_axis) return dy_out @@ -792,7 +792,7 @@ def roll(x, shifts, axis=None, name=None): else: axis = [] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.roll(x, 'axis', axis, 'shifts', shifts) helper = LayerHelper("roll", **locals()) @@ -1108,7 +1108,7 @@ def unique_consecutive(x, else: axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out, inverse, counts = _C_ops.unique_consecutive( x, 'dtype', attr_dtype, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis) @@ -1213,7 +1213,7 @@ def unique(x, else: axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out, inverse, indices, counts = _C_ops.unique( x, 'dtype', attr_dtype, 'return_index', return_index, 'return_inverse', return_inverse, 'return_counts', return_counts, @@ -1397,7 +1397,7 @@ def gather(x, index, axis=None, name=None): if axis is None: axis = 0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): axis = axis.item() if isinstance(axis, paddle.Tensor) else axis return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) @@ -1471,7 +1471,7 @@ def unbind(input, axis=0): input_shape = input.shape axis_ = axis if axis >= 0 else len(input_shape) + axis num = input_shape[axis_] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.unbind(input, num, 'axis', axis) helper = LayerHelper("unbind", **locals()) @@ -1565,7 +1565,7 @@ def scatter(x, index, updates, overwrite=True, name=None): # [2., 2.], # [1., 1.]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.scatter(x, index, updates, 'overwrite', overwrite) check_variable_and_dtype( @@ -1744,7 +1744,7 @@ def tile(x, repeat_times, name=None): np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.tile(x, 'repeat_times', repeat_times) check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') if isinstance(repeat_times, Variable): @@ -1827,7 +1827,7 @@ def 
expand_as(x, y, name=None): np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.expand_as_v2(x, 'target_shape', y.shape) check_variable_and_dtype( @@ -1881,7 +1881,7 @@ def broadcast_to(x, shape, name=None): print(out) # [[1, 2, 3], [1, 2, 3]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): @@ -1968,7 +1968,7 @@ def expand(x, shape, name=None): print(out) # [[1, 2, 3], [1, 2, 3]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): @@ -2407,7 +2407,7 @@ def tensordot(x, y, axes=2, name=None): check_type(axes, 'axes', (int, tuple, list, Variable), op_type) def _var_to_list(var): - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return tolist(var) raise TypeError( "The 'axes' with type 'Tensor' in " + op_type + @@ -2523,7 +2523,7 @@ def as_complex(x, name=None): # [[ 0. +1.j 2. +3.j 4. +5.j] # [ 6. +7.j 8. +9.j 10.+11.j]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return paddle._C_ops.as_complex(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex') @@ -2572,7 +2572,7 @@ def as_real(x, name=None): # [ 8. 9.] # [10. 11.]]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return paddle._C_ops.as_real(x) check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real') @@ -2626,7 +2626,7 @@ def repeat_interleave(x, repeats, axis=None, name=None): x = paddle.flatten(x) axis = 0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if isinstance(repeats, int): return _C_ops.repeat_interleave(x, None, 'Repeats', repeats, 'dim', axis) @@ -2733,7 +2733,7 @@ def moveaxis(x, source, destination, name=None): for i in range(len(src_dims)): perm[dst_dims[i]] = src_dims[i] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out, _ = _C_ops.transpose2(x, 'axis', perm) return out @@ -2814,7 +2814,7 @@ def take_along_axis(arr, indices, axis): if not broadcast_shape: # if indices matrix have larger size than arr, arr should broadcast into indices shape. 
broadcast_shape = indices.shape - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): indices = paddle.broadcast_to(indices, broadcast_shape) broadcast_shape_list = list(broadcast_shape) broadcast_shape_list[axis] = list(arr.shape)[axis] @@ -2879,7 +2879,7 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): "`indices` and `arr` must have the same number of dimensions!") axis = non_negative_axis(arr, axis) broadcast_shape = infer_broadcast_shape(arr, indices, axis) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): values = paddle.to_tensor(values) if not isinstance( values, paddle.Tensor) else values if broadcast_shape: diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ba8a4d7f11990..a36bf1c432515 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -26,8 +26,9 @@ from paddle.tensor import cast from paddle.tensor.attribute import _complex_to_real_dtype import paddle -from ..fluid import layers -from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ +from paddle.static import Variable +from ..framework import core +from ..framework import _varbase_creator, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn @@ -70,7 +71,8 @@ from ..fluid.layers import atanh # noqa: F401 from ..fluid.layers import multiplex # noqa: F401 -from ..fluid import layers +from ..fluid.layers import reduce_prod +from ..fluid.layers import elementwise_sub from paddle import _C_ops __all__ = [] @@ -147,7 +149,7 @@ def pow(x, y, name=None): """ # in dynamic graph mode - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if isinstance(y, (int, float)): return _C_ops.pow(x, 'factor', y) elif isinstance(y, (paddle.Tensor, Variable)): @@ -240,7 +242,7 @@ def add(x, y, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.elementwise_add(x, y) return _elementwise_op(LayerHelper('elementwise_add', **locals())) @@ -319,7 +321,7 @@ def subtract(x, y, name=None): op_type = 'elementwise_sub' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -376,7 +378,7 @@ def divide(x, y, name=None): op_type = 'elementwise_div' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -415,7 +417,7 @@ def floor_divide(x, y, name=None): """ op_type = 'elementwise_floordiv' axis = -1 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) @@ -455,7 +457,7 @@ def remainder(x, y, name=None): """ op_type = 'elementwise_mod' axis = -1 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) @@ -505,7 +507,7 @@ def multiply(x, y, name=None): act = None axis = -1 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) @@ -570,7 +572,7 @@ def maximum(x, y, name=None): op_type = 'elementwise_max' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return 
_elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -629,7 +631,7 @@ def minimum(x, y, name=None): op_type = 'elementwise_min' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -690,7 +692,7 @@ def fmax(x, y, name=None): op_type = 'elementwise_fmax' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -751,7 +753,7 @@ def fmin(x, y, name=None): op_type = 'elementwise_fmin' axis = -1 act = None - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) return _elementwise_op(LayerHelper(op_type, **locals())) @@ -860,7 +862,7 @@ def get_dtype(x, dtype): return (False, src_type) dtype_flag, dtype = get_dtype(x, dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, @@ -1024,7 +1026,7 @@ def add_n(inputs, name=None): # [[8., 10., 12.], # [14., 16., 18.]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if isinstance(inputs, Variable): inputs = [inputs] return _C_ops.sum(inputs, 'use_mkldnn', False) @@ -1080,7 +1082,7 @@ def trunc(input, name=None): # [[0., 0.], # [0., 0.]])) ''' - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.trunc(input) else: inputs = {"X": input} @@ -1164,7 +1166,7 @@ def mm(input, mat2, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.matmul_v2(input, mat2) def __check_input(x, y): @@ -1269,7 +1271,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = _C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta) return out @@ -1328,7 +1330,7 @@ def renorm(x, p, axis, max_norm): if not axis >= -1 * len(input_shape): raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape))) axis = axis + len(input_shape) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = core.ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) return out @@ -1384,7 +1386,7 @@ def inner(x, y, name=None): nx = x.reshape((-1, xshape[-1])) ny = y.reshape((-1, yshape[-1])) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.matmul_v2(nx, ny.T).reshape(dstshape) def __check_input(x, y): @@ -1447,7 +1449,7 @@ def outer(x, y, name=None): nx = x.reshape((-1, 1)) ny = y.reshape((1, -1)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.matmul_v2(nx, ny) def __check_input(x, y): @@ -1516,7 +1518,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None): if axis is None or len(axis) == 0: axis = [0] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x', @@ -1560,7 +1562,7 @@ def inverse(x, name=None): print(inv) # [[0.5, 0], [0, 0.5]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.inverse(x) def _check_input(x): @@ -1676,7 +1678,7 @@ def max(x, axis=None, keepdim=False, name=None): """ reduce_all, axis = _get_reduce_all_value(axis) - if 
in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -1776,7 +1778,7 @@ def min(x, axis=None, keepdim=False, name=None): """ reduce_all, axis = _get_reduce_all_value(axis) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -1889,7 +1891,7 @@ def amax(x, axis=None, keepdim=False, name=None): """ reduce_all, axis = _get_reduce_all_value(axis) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('amax', **locals()) @@ -2002,7 +2004,7 @@ def amin(x, axis=None, keepdim=False, name=None): """ reduce_all, axis = _get_reduce_all_value(axis) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) helper = LayerHelper('amin', **locals()) @@ -2046,7 +2048,7 @@ def log1p(x, name=None): # [[0.], [0.6931472]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.log1p(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log1p") @@ -2095,7 +2097,7 @@ def log2(x, name=None): res = paddle.log2(x_i) print(res) # [1.0] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.log2(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], "log2") @@ -2145,7 +2147,7 @@ def log10(x, name=None): res = paddle.log10(x_i) print(res) # [1.0] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.log10(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], "log10") @@ -2206,7 +2208,7 @@ def clip(x, min=None, max=None, name=None): min_ = float(np.finfo(np.float32).min) max_ = float(np.finfo(np.float32).max) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if isinstance(min, Variable): min = min.numpy().item(0) if isinstance(max, Variable): @@ -2339,7 +2341,7 @@ def __check_input(input, offset, dim1, dim2): "But received axis1 = %d, axis2 = %d\n"%(axis1, axis2) __check_input(input, offset, axis1, axis2) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.trace(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) inputs = {'Input': [x]} @@ -2422,7 +2424,7 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): # [0.17020577, 0.27325270]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.diagonal(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) def __check_input(input, offset, dim1, dim2): @@ -2499,7 +2501,7 @@ def kron(x, y, name=None): # [12, 15, 18, 16, 20, 24], # [21, 24, 27, 28, 32, 36]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.kron(x, y) helper = LayerHelper('kron', **locals()) @@ -2557,9 +2559,9 @@ def cumsum(x, axis=None, dtype=None, name=None): else: flatten = False if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype): - x = layers.cast(x, dtype) + x = cast(x, dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if axis is None: return _C_ops.cumsum(x, 'flatten', flatten) else: @@ -2622,9 +2624,9 @@ def cumprod(x, dim=None, dtype=None, name=None): """ if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype): - x = layers.cast(x, dtype) + x = cast(x, dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.cumprod(x, 'dim', dim) check_variable_and_dtype(x, "x", ['complex64', 'complex128', 'float32', 
'float64', 'int32', 'int64'], 'cumprod') @@ -2656,7 +2658,7 @@ def isfinite(x, name=None): out = paddle.tensor.isfinite(x) print(out) # [False True True False True False False] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.isfinite_v2(x) helper = LayerHelper("isfinite_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite') @@ -2684,7 +2686,7 @@ def isinf(x, name=None): out = paddle.tensor.isinf(x) print(out) # [ True False False True False False False] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.isinf_v2(x) helper = LayerHelper("isinf_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf') @@ -2712,7 +2714,7 @@ def isnan(x, name=None): out = paddle.tensor.isnan(x) print(out) # [False False False False False True True] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.isnan_v2(x) helper = LayerHelper("isnan_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan') @@ -2783,9 +2785,9 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): if dtype is not None: check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'prod') if x.dtype != convert_np_dtype_to_dtype_(dtype): - x = layers.cast(x, dtype) + x = cast(x, dtype) - return layers.reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name) + return reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name) def sign(x, name=None): @@ -2809,7 +2811,7 @@ def sign(x, name=None): out = paddle.sign(x=x) print(out) # [1.0, 0.0, -1.0, 1.0] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.sign(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sign') @@ -2846,7 +2848,7 @@ def tanh(x, name=None): print(out) # [-0.37994896 -0.19737532 0.09966799 0.29131261] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.tanh(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tanh') @@ -2888,7 +2890,7 @@ def increment(x, value=1.0, name=None): # [1.] 
""" - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.increment(x, 'step', value) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], @@ -2969,7 +2971,7 @@ def all(x, axis=None, keepdim=False, name=None): else: reduce_all_flag = False - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): axis = axis if axis != None and axis != [] else [0] return _C_ops.reduce_all(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) @@ -3061,7 +3063,7 @@ def any(x, axis=None, keepdim=False, name=None): else: reduce_all_flag = False - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): axis = axis if axis != None and axis != [] else [0] return _C_ops.reduce_any(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) @@ -3142,7 +3144,7 @@ def conj(x, name=None): # [(4-4j), (5-5j), (6-6j)]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.conj(x) check_variable_and_dtype(x, "x", ['complex64', 'complex128', 'float32', 'float64', 'int32', 'int64'], 'conj') @@ -3181,7 +3183,7 @@ def digamma(x, name=None): # [ nan , 5.32286835]]) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.digamma(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'digamma') @@ -3212,7 +3214,7 @@ def neg(x, name=None): # [0.4 0.2 -0.1 -0.3] """ - return layers.scale(x, scale=-1.0, bias=0.0, bias_after_scale=True, act=None, name=name) + return scale(x, scale=-1.0, bias=0.0, bias_after_scale=True, act=None, name=name) def atan2(x, y, name=None): r""" @@ -3257,7 +3259,7 @@ def atan2(x, y, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.atan2(x, y) else: check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') @@ -3313,7 +3315,7 @@ def logit(x, eps=None, name=None): if eps == None: eps = 0.0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.logit(x, 'eps', eps) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'logit') @@ -3356,7 +3358,7 @@ def lerp(x, y, weight, name=None): # out: [5.5., 6., 6.5, 7.] 
""" - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): check_type(weight, 'weight', (float, paddle.Tensor, Variable), 'lerp') if isinstance(weight, float): weight = paddle.to_tensor(weight, dtype=x.dtype) @@ -3419,7 +3421,7 @@ def erfinv(x, name=None): """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv') - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.erfinv(x) helper = LayerHelper('erfinv', **locals()) @@ -3478,7 +3480,7 @@ def rad2deg(x, name=None): # [57.29578018]) """ rad2deg_scale = 180 / np.pi - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") return _C_ops.scale(x, 'scale', rad2deg_scale) @@ -3531,7 +3533,7 @@ def deg2rad(x, name=None): # [3.14159274]) """ deg2rad_scale = np.pi / 180.0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") return _C_ops.scale(x, 'scale', deg2rad_scale) @@ -3615,7 +3617,7 @@ def _gcd_body_fn(x, y): paddle.where(y_not_equal_0, paddle.mod(x, y_safe),paddle.zeros(y.shape, y.dtype))) return (paddle.where(x < y, y, x), paddle.where(x < y, x, y)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): while _gcd_cond_fn(x, y): x, y = _gcd_body_fn(x, y) @@ -3749,7 +3751,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): dtype = x.dtype axes = [axis] infer_flags = list(1 for i in range(len(axes))) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): has_pend = False input_list = [] if prepend is not None and append is not None: @@ -3788,7 +3790,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): op = getattr(_C_ops, "logical_xor") out = op(input_back, input_front) else: - out = layers.elementwise_sub(input_back, input_front, axis=axis) + out = elementwise_sub(input_back, input_front, axis=axis) return out else: check_variable_and_dtype(x, 'x', ['float32', 'float64', 'bool', 'int32', 'int64'], 'diff') @@ -3840,7 +3842,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): type='logical_xor', inputs={"X": input_back, "Y": input_front}, outputs={"Out": out} ) else: - out = layers.elementwise_sub(input_back, input_front, axis=axis) + out = elementwise_sub(input_back, input_front, axis=axis) return out @@ -3883,7 +3885,7 @@ def angle(x, name=None): # [-1.1071488 -0.7853982 0. 
0.7853982]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.angle(x) check_variable_and_dtype(x, 'x', diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 5adb937118303..c4e7e96191acf 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -14,13 +14,14 @@ # TODO: define random functions -from ..fluid import core -from ..fluid.framework import in_dygraph_mode, Variable, convert_np_dtype_to_dtype_, dygraph_only +from ..framework import core +from ..framework import convert_np_dtype_to_dtype_, dygraph_only from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils import paddle from paddle import _C_ops +from paddle.static import Variable __all__ = [] @@ -65,7 +66,7 @@ def bernoulli(x, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.bernoulli(x) check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli") @@ -110,7 +111,7 @@ def poisson(x, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.poisson(x) check_variable_and_dtype(x, "x", ["float32", "float64"], "poisson") @@ -173,7 +174,7 @@ def multinomial(x, num_samples=1, replacement=False, name=None): assert core.is_compiled_with_rocm() == False, ( "multinomial op is not supported on ROCM yet.") - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.multinomial(x, 'num_samples', num_samples, 'replacement', replacement) @@ -231,7 +232,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) return _C_ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', @@ -422,7 +423,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): # [1.00780561 3.78457445 5.81058198] # random """ - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): check_type(mean, 'mean', (int, float, Variable), 'normal') check_type(std, 'std', (int, float, Variable), 'normal') if isinstance(mean, Variable): @@ -454,7 +455,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): return gaussian(shape=shape, mean=mean, std=std, name=name) out = out * std + mean - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): out.stop_grediant = True return out @@ -540,7 +541,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max', @@ -679,7 +680,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) return _C_ops.randint('shape', shape, 'low', low, 'high', high, 'seed', 0, 'dtype', dtype) @@ -846,7 +847,7 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): "randint_like's low must less then high, but received low = {0}, " "high = {1}".format(low, high)) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): shape = utils.convert_shape_to_list(shape) out = _C_ops.randint('shape', shape, 'low', low, 
'high', high, 'seed', 0, 'dtype', core.VarDesc.VarType.INT64) @@ -911,7 +912,7 @@ def randperm(n, dtype="int64", name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.randperm('n', n, 'seed', 0, 'dtype', dtype) if n < 1: @@ -1014,7 +1015,7 @@ def exponential_(x, lam=1.0, name=None): # [0.72520673, 0.45208144, 0.30234432]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.exponential_(x, "lambda", lam) check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 2a2e7d000a1e6..5c5517e54f71a 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -13,14 +13,16 @@ # limitations under the License. from __future__ import print_function import numpy as np +import paddle from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype -from ..fluid import core, layers -from paddle.common_ops_import import in_dygraph_mode +from ..fluid import layers +from ..framework import core from paddle.common_ops_import import convert_np_dtype_to_dtype_ from paddle.common_ops_import import Variable from paddle.common_ops_import import VarDesc from paddle import _C_ops +from .logic import logical_not # TODO: define searching & indexing functions of a tensor # from ..fluid.layers import has_inf #DEFINE_ALIAS @@ -88,7 +90,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [1 1 0 2] # [0 2 1 1]]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): _, ids = _C_ops.argsort(x, 'axis', axis, 'descending', descending) return ids check_variable_and_dtype( @@ -165,7 +167,7 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): flatten = True axis = 0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = _C_ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', keepdim, 'flatten', flatten) return out @@ -242,7 +244,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): flatten = True axis = 0 - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out = _C_ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', keepdim, 'flatten', flatten) return out @@ -302,7 +304,7 @@ def index_select(x, index, axis=0, name=None): # [ 9. 10. 10.]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.index_select(x, index, 'dim', axis) helper = LayerHelper("index_select", **locals()) @@ -378,7 +380,7 @@ def nonzero(x, as_tuple=False): shape = x.shape rank = len(shape) - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): outs = _C_ops.where_index(x) else: outs = layers.where(x) @@ -390,7 +392,7 @@ def nonzero(x, as_tuple=False): else: for i in range(rank): list_out.append( - layers.slice( + paddle.slice( outs, axes=[1], starts=[i], ends=[i + 1])) return tuple(list_out) @@ -452,7 +454,7 @@ def sort(x, axis=-1, descending=False, name=None): # [4. 7. 4. 6.] # [5. 7. 7. 
9.]]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): out, _ = _C_ops.argsort(x, 'axis', axis, 'descending', descending) return out helper = LayerHelper("sort", **locals()) @@ -501,7 +503,7 @@ def mode(x, axis=-1, keepdim=False, name=None): # [1, 0]])) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.mode(x, "axis", axis, "keepdim", keepdim) helper = LayerHelper("mode", **locals()) @@ -575,7 +577,7 @@ def where(condition, x=None, y=None, name=None): if x is None or y is None: raise ValueError("either both or neither of x and y should be given") - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): check_variable_and_dtype(condition, 'condition', ['bool'], 'where') check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64'], 'where') @@ -592,28 +594,27 @@ def where(condition, x=None, y=None, name=None): broadcast_y = y else: if core.is_compiled_with_xpu(): - cond_int = layers.cast(condition, x.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), x.dtype) - out1 = layers.elementwise_mul(x, cond_int) - out2 = layers.elementwise_mul(y, cond_not_int) - out = layers.elementwise_add(out1, out2) + cond_int = paddle.cast(condition, x.dtype) + cond_not_int = paddle.cast(logical_not(condition), x.dtype) + out1 = paddle.multiply(x, cond_int) + out2 = paddle.multiply(y, cond_not_int) + out = paddle.add(out1, out2) return out - zeros_like_x = layers.zeros_like(x) - zeros_like_y = layers.zeros_like(y) - zeros_like_condition = layers.zeros_like(condition) - zeros_like_condition = layers.cast(zeros_like_condition, x.dtype) - cast_cond = layers.cast(condition, x.dtype) - - broadcast_zeros = layers.elementwise_add(zeros_like_x, zeros_like_y) - broadcast_zeros = layers.elementwise_add(broadcast_zeros, - zeros_like_condition) - broadcast_x = layers.elementwise_add(x, broadcast_zeros) - broadcast_y = layers.elementwise_add(y, broadcast_zeros) - broadcast_condition = layers.elementwise_add(cast_cond, broadcast_zeros) - broadcast_condition = layers.cast(broadcast_condition, 'bool') - - if in_dygraph_mode(): + zeros_like_x = paddle.zeros_like(x) + zeros_like_y = paddle.zeros_like(y) + zeros_like_condition = paddle.zeros_like(condition) + zeros_like_condition = paddle.cast(zeros_like_condition, x.dtype) + cast_cond = paddle.cast(condition, x.dtype) + + broadcast_zeros = paddle.add(zeros_like_x, zeros_like_y) + broadcast_zeros = paddle.add(broadcast_zeros, zeros_like_condition) + broadcast_x = paddle.add(x, broadcast_zeros) + broadcast_y = paddle.add(y, broadcast_zeros) + broadcast_condition = paddle.add(cast_cond, broadcast_zeros) + broadcast_condition = paddle.cast(broadcast_condition, 'bool') + + if paddle.in_dynamic_mode(): return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y) else: helper = LayerHelper("where", **locals()) @@ -704,7 +705,7 @@ def index_sample(x, index): # [1200 1100]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.index_sample(x, index) helper = LayerHelper("index_sample", **locals()) @@ -752,7 +753,7 @@ def masked_select(x, mask, name=None): #[1.0 5.0 6.0 9.0] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.masked_select(x, mask) helper = LayerHelper("masked_select", **locals()) @@ -822,7 +823,7 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): # [[1 1 0 0]] """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): k = k.numpy().item(0) if isinstance(k, Variable) else k if axis is None: out, indices = _C_ops.top_k_v2(x, 'k', @@ -906,7 
+907,7 @@ def searchsorted(sorted_sequence, """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.searchsorted(sorted_sequence, values, "out_int32", out_int32, "right", right) @@ -969,7 +970,7 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): # [[0, 2], # [1, 2]])) """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): if axis is not None: return _C_ops.kthvalue(x, 'k', k, "axis", axis, "keepdim", keepdim) else: diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index d54c7fe74dab7..468aa46048627 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -15,10 +15,9 @@ # TODO: define statistical functions of a tensor import numpy as np -from ..fluid.framework import Variable +from ..static import Variable from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import core, in_dygraph_mode -from ..fluid import layers +from ..framework import core from .search import where from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype import paddle @@ -88,7 +87,7 @@ def mean(x, axis=None, keepdim=False, name=None): if axis is None or len(axis) == 0: axis = [0] - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -150,7 +149,7 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): out2 = paddle.var(x, axis=1) # [1. 4.33333333] """ - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var') u = mean(x, axis, True, name) @@ -209,7 +208,7 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None): out2 = paddle.std(x, axis=1) # [1. 2.081666] """ - if not in_dygraph_mode(): + if not paddle.in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'std') out = var(**locals()) @@ -237,7 +236,7 @@ def numel(x, name=None): """ - if in_dygraph_mode(): + if paddle.in_dynamic_mode(): return _C_ops.size(x) if not isinstance(x, Variable): diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 0e76d92ca73ef..85672ec7a36e6 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -14,7 +14,7 @@ import paddle import numpy as np -from paddle.fluid.layers import core +from ..framework import core from paddle.fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype __all__ = [] From a167a1435c525ac38ad44bacf45ffa816823761e Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 22 Feb 2022 21:32:37 +0800 Subject: [PATCH 046/101] import llvm::ArrayRef and add test (#39802) --- paddle/testing/CMakeLists.txt | 1 + paddle/testing/array_ref_test.cc | 92 +++++++++ paddle/utils/array_ref.h | 337 +++++++++++++++++++++++++++++++ 3 files changed, 430 insertions(+) create mode 100644 paddle/testing/array_ref_test.cc create mode 100644 paddle/utils/array_ref.h diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 4208132b98051..fe288ec2bf1d1 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -4,3 +4,4 @@ if(WITH_TESTING) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags) endif() cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) +cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) diff --git a/paddle/testing/array_ref_test.cc b/paddle/testing/array_ref_test.cc new file 
mode 100644 index 0000000000000..33a09c499246d --- /dev/null +++ b/paddle/testing/array_ref_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/utils/array_ref.h" + +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +TEST(array_ref, array_ref) { + paddle::ArrayRef a; + CHECK_EQ(a.size(), size_t(0)); + CHECK_EQ(a.data(), static_cast(nullptr)); + + paddle::ArrayRef b(paddle::none); + CHECK_EQ(b.size(), size_t(0)); + CHECK_EQ(b.data(), static_cast(nullptr)); + + int v = 1; + paddle::ArrayRef c(v); + CHECK_EQ(c.size(), size_t(1)); + CHECK_EQ(c.data(), &v); + CHECK_EQ(c.equals(paddle::makeArrayRef(v)), true); + + int v1[5] = {1, 2, 3, 4, 5}; + paddle::ArrayRef d(v1, 5); + CHECK_EQ(d.size(), size_t(5)); + CHECK_EQ(d.data(), v1); + CHECK_EQ(d.equals(paddle::makeArrayRef(v1, 5)), true); + + paddle::ArrayRef e(&v1[0], &v1[4]); + CHECK_EQ(e.size(), size_t(4)); + CHECK_EQ(e.data(), v1); + CHECK_EQ(e.equals(paddle::makeArrayRef(&v1[0], &v1[4])), true); + + paddle::SmallVector small_vector{1, 2, 3}; + paddle::ArrayRef f(small_vector); + CHECK_EQ(f.size(), size_t(3)); + CHECK_EQ(f.data(), small_vector.data()); + CHECK_EQ(f.equals(paddle::makeArrayRef(small_vector)), true); + + std::vector vector{1, 2, 3}; + paddle::ArrayRef g(vector); + CHECK_EQ(g.size(), size_t(3)); + CHECK_EQ(g.data(), vector.data()); + CHECK_EQ(g.equals(paddle::makeArrayRef(vector)), true); + + std::initializer_list list = {1, 2, 3}; + paddle::ArrayRef h(list); + CHECK_EQ(h.size(), size_t(3)); + CHECK_EQ(h.data(), list.begin()); + + paddle::ArrayRef i(h); + CHECK_EQ(i.size(), size_t(3)); + CHECK_EQ(i.data(), list.begin()); + CHECK_EQ(i.equals(h), true); + CHECK_EQ(i.equals(paddle::makeArrayRef(h)), true); + + auto slice = i.slice(1, 2); + CHECK_EQ(slice.size(), size_t(2)); + CHECK_EQ(slice[0], 2); + CHECK_EQ(slice[1], 3); + + auto drop = i.drop_front(2); + CHECK_EQ(drop.size(), size_t(1)); + CHECK_EQ(drop[0], 3); + + paddle::ArrayRef nums = {1, 2, 3, 4, 5, 6, 7, 8}; + auto front = nums.take_front(3); + CHECK_EQ(front.size(), size_t(3)); + for (size_t i = 0; i < 3; ++i) { + CHECK_EQ(front[i], nums[i]); + } + auto back = nums.take_back(3); + CHECK_EQ(back.size(), size_t(3)); + for (size_t i = 0; i < 3; ++i) { + CHECK_EQ(back[i], nums[i + 5]); + } +} diff --git a/paddle/utils/array_ref.h b/paddle/utils/array_ref.h new file mode 100644 index 0000000000000..9b39e9775f97a --- /dev/null +++ b/paddle/utils/array_ref.h @@ -0,0 +1,337 @@ +// This file copy from llvm/ADT/ArrayRef.h, version: 12.0.0 +// Modified the following points +// 1. remove hash_value functions +// 2. replace with the llvm::NoneType with paddle::none_t +// 3. remove drop_while, drop_until, take_while, take_until methods + +//===- ArrayRef.h - Array Reference Wrapper ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef PADDLE_UTILS_ARRAY_REF_H_ +#define PADDLE_UTILS_ARRAY_REF_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/utils/none.h" +#include "paddle/utils/small_vector.h" + +namespace paddle { + +/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// consecutively in memory), i.e. a start pointer and a length. It allows +/// various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the ArrayRef. For this reason, it is not in general +/// safe to store an ArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. +template +class ArrayRef { + public: + using iterator = const T *; + using const_iterator = const T *; + using size_type = size_t; + using reverse_iterator = std::reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T *Data = nullptr; + + /// The number of elements. + size_type Length = 0; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /*implicit*/ ArrayRef() = default; + + /// Construct an empty ArrayRef from None. + /*implicit*/ ArrayRef(none_t) {} + + /// Construct an ArrayRef from a single element. + /*implicit*/ ArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + /*implicit*/ ArrayRef(const T *data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + ArrayRef(const T *begin, const T *end) : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::vector. + template + /*implicit*/ ArrayRef(const std::vector &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /*implicit*/ constexpr ArrayRef(const std::array &Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /*implicit*/ ArrayRef(const std::initializer_list &Vec) + : Data(Vec.begin() == Vec.end() ? (T *)nullptr : Vec.begin()), + Length(Vec.size()) {} + + /// Construct an ArrayRef from ArrayRef. This uses SFINAE to + /// ensure that only ArrayRefs of pointers can be converted. + template + ArrayRef(const ArrayRef &A, + std::enable_if_t::value> + * = nullptr) + : Data(A.data()), Length(A.size()) {} + + /// Construct an ArrayRef from a SmallVector. This is + /// templated in order to avoid instantiating SmallVectorTemplateCommon + /// whenever we copy-construct an ArrayRef. 
+ template + /*implicit*/ ArrayRef( + const SmallVectorTemplateCommon &Vec, + std::enable_if_t::value> * = + nullptr) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from std::vector. This uses SFINAE + /// to ensure that only vectors of pointers can be converted. + template + ArrayRef( + const std::vector &Vec, + std::enable_if_t::value> * = 0) + : Data(Vec.data()), Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + iterator begin() const { return Data; } + iterator end() const { return Data + Length; } + + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + + /// empty - Check if the array is empty. + bool empty() const { return Length == 0; } + + const T *data() const { return Data; } + + /// size - Get the array size. + size_t size() const { return Length; } + + /// front - Get the first element. + const T &front() const { + assert(!empty()); + return Data[0]; + } + + /// back - Get the last element. + const T &back() const { + assert(!empty()); + return Data[Length - 1]; + } + + // copy - Allocate copy in Allocator and return ArrayRef to it. + template + ArrayRef copy(Allocator &A) { + T *Buff = A.template Allocate(Length); + std::uninitialized_copy(begin(), end(), Buff); + return ArrayRef(Buff, Length); + } + + /// equals - Check for element-wise equality. + bool equals(ArrayRef RHS) const { + if (Length != RHS.Length) return false; + return std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + ArrayRef slice(size_t N, size_t M) const { + assert(N + M <= size() && "Invalid specifier"); + return ArrayRef(data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + ArrayRef slice(size_t N) const { return slice(N, size() - N); } + + /// Drop the first \p N elements of the array. + ArrayRef drop_front(size_t N = 1) const { + assert(size() >= N && "Dropping more elements than exist"); + return slice(N, size() - N); + } + + /// Drop the last \p N elements of the array. + ArrayRef drop_back(size_t N = 1) const { + assert(size() >= N && "Dropping more elements than exist"); + return slice(0, size() - N); + } + + /// Return a copy of *this with only the first \p N elements. + ArrayRef take_front(size_t N = 1) const { + if (N >= size()) return *this; + return drop_back(size() - N); + } + + /// Return a copy of *this with only the last \p N elements. + ArrayRef take_back(size_t N = 1) const { + if (N >= size()) return *this; + return drop_front(size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + const T &operator[](size_t Index) const { + assert(Index < Length && "Invalid index!"); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t::value, ArrayRef> &operator=( + U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
+ template + std::enable_if_t::value, ArrayRef> &operator=( + std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { return std::vector(Data, Data + Length); } + + /// @} + /// @name Conversion operators + /// @{ + operator std::vector() const { + return std::vector(Data, Data + Length); + } + + /// @} +}; + +/// @name ArrayRef Convenience constructors +/// @{ + +/// Construct an ArrayRef from a single element. +template +ArrayRef makeArrayRef(const T &OneElt) { + return OneElt; +} + +/// Construct an ArrayRef from a pointer and length. +template +ArrayRef makeArrayRef(const T *data, size_t length) { + return ArrayRef(data, length); +} + +/// Construct an ArrayRef from a range. +template +ArrayRef makeArrayRef(const T *begin, const T *end) { + return ArrayRef(begin, end); +} + +/// Construct an ArrayRef from a SmallVector. +template +ArrayRef makeArrayRef(const SmallVectorImpl &Vec) { + return Vec; +} + +/// Construct an ArrayRef from a SmallVector. +template +ArrayRef makeArrayRef(const SmallVector &Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::vector. +template +ArrayRef makeArrayRef(const std::vector &Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::array. +template +ArrayRef makeArrayRef(const std::array &Arr) { + return Arr; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) (const) +template +ArrayRef makeArrayRef(const ArrayRef &Vec) { + return Vec; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) +template +ArrayRef &makeArrayRef(ArrayRef &Vec) { + return Vec; +} + +/// Construct an ArrayRef from a C array. +template +ArrayRef makeArrayRef(const T (&Arr)[N]) { + return ArrayRef(Arr); +} + +/// @} +/// @name ArrayRef Comparison Operators +/// @{ + +template +inline bool operator==(ArrayRef LHS, ArrayRef RHS) { + return LHS.equals(RHS); +} + +template +inline bool operator==(SmallVectorImpl &LHS, ArrayRef RHS) { + return ArrayRef(LHS).equals(RHS); +} + +template +inline bool operator!=(ArrayRef LHS, ArrayRef RHS) { + return !(LHS == RHS); +} + +template +inline bool operator!=(SmallVectorImpl &LHS, ArrayRef RHS) { + return !(LHS == RHS); +} + +} // end namespace paddle + +#endif // PADDLE_UTILS_ARRAY_REF_H_ From edc3ba13010497a04e6859a804ce535faf5e5945 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 22 Feb 2022 21:45:33 +0800 Subject: [PATCH 047/101] [custom kernel]Delete useless and upgrade (#39791) * [custom kernel]Delete useless * change RegType enum names * mod notes * merge * update --- .../fluid/framework/op_kernel_info_helper.h | 71 - paddle/phi/api/ext/op_kernel_info.h | 1257 ----------------- paddle/phi/api/lib/op_kernel_info.cc | 108 -- paddle/phi/core/kernel_registry.h | 16 +- 4 files changed, 8 insertions(+), 1444 deletions(-) delete mode 100644 paddle/fluid/framework/op_kernel_info_helper.h delete mode 100644 paddle/phi/api/ext/op_kernel_info.h delete mode 100644 paddle/phi/api/lib/op_kernel_info.cc diff --git a/paddle/fluid/framework/op_kernel_info_helper.h b/paddle/fluid/framework/op_kernel_info_helper.h deleted file mode 100644 index d62711bb88275..0000000000000 --- a/paddle/fluid/framework/op_kernel_info_helper.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/kernel_factory.h" - -namespace paddle { -namespace framework { - -class OpKernelInfoHelper { - public: - static const std::string& GetOpName(const paddle::OpKernelInfo& info) { - return info.op_name_; - } - - static const phi::Backend& GetBackend(const paddle::OpKernelInfo& info) { - return info.backend_; - } - - static const phi::DataLayout& GetDataLayout( - const paddle::OpKernelInfo& info) { - return info.layout_; - } - - static const phi::DataType& GetDataType(const paddle::OpKernelInfo& info) { - return info.dtype_; - } - - static phi::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) { - return phi::KernelKey(info.backend_, info.layout_, info.dtype_); - } - - static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) { - return info.kernel_fn_; - } - - static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) { - return info.variadic_kernel_fn_; - } - - static const paddle::SmallVector& GetInputDefs( - const paddle::OpKernelInfo& info) { - return info.input_defs_; - } - - static const paddle::SmallVector& GetOutputDefs( - const paddle::OpKernelInfo& info) { - return info.output_defs_; - } - - static const paddle::SmallVector& GetAttributeDefs( - const paddle::OpKernelInfo& info) { - return info.attribute_defs_; - } -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/phi/api/ext/op_kernel_info.h b/paddle/phi/api/ext/op_kernel_info.h deleted file mode 100644 index b3adbe9d18b96..0000000000000 --- a/paddle/phi/api/ext/op_kernel_info.h +++ /dev/null @@ -1,1257 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/api/ext/dll_decl.h" -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/utils/any.h" -#include "paddle/utils/small_vector.h" - -#include "paddle/phi/common/data_type.h" - -/** - * Custom Kernel Info Define. - * - * Used to maintain custom kernel core information before registering. - * Pten is working on exposing headers, custom kernel depends on them, and - * we prefer outer users following pten-kernel-function-style and registering - * macro. So, we have to re-implement some structs or class and functions to - * make sure users' custom kernel functions can be registered to pten. 
- * - * TODO(Aganlengzi): We should upgrade following pten. - */ - -namespace paddle { -namespace framework { -class PADDLE_API OpKernelInfoHelper; -} // namespace framework - -// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting -// before phi::DeviceContext is exposed. -class DeviceContext { - public: - DeviceContext() { stream_ = nullptr; } - void set_stream(void* stream) { stream_ = stream; } - void* stream() const { return stream_; } - - private: - void* stream_; -}; -class CPUContext : public DeviceContext {}; - -// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed -using Tensor = paddle::experimental::Tensor; -using Scalar = phi::Scalar; -using ScalarArray = phi::ScalarArray; - -// Record custom kernel core information -// We can not use phi::KernelFn directly, so users' custom kernel function -// is signatured to `CustomKernelFunc', notice that the first parameter is -// fixed to `const DeviceContext&'. -using CustomKernelFunc = - void (*)(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs); - -////////////////////// Kernel Function (PD_PT_KERNEL) //////////////////////// -#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert(vec_in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert( \ - attr_idx == 0, \ - "Kernel's DeviceContext should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - const device_ctx& arg = static_cast(dev_ctx); \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const Tensor& arg = inputs[in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper&, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... 
pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::vector& arg = vec_inputs[vec_in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - try { \ - attr_type arg = paddle::any_cast(attrs[attr_idx]); \ - return CustomComputeCallHelper::template Compute< \ - dev_ctx_idx, \ - in_idx, \ - vec_in_idx, \ - attr_idx + 1, \ - out_idx, \ - vec_out_idx>(dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } catch (paddle::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - tensor_type* arg = (*outputs)[out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - std::vector arg = (*vec_outputs)[vec_out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -template -struct PtenTypeTag {}; - -template -struct CustomKernelFuncImpl; - -template -struct CustomKernelFuncImpl { - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs) { - CustomComputeCallHelper>:: - template Compute<0, 0, 0, 0, 0, 0>( - dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs); - } - - // NOTE: Tensor in args is paddle::Tensor but not DenseTensor - static void VariadicCompute(const DeviceContext& dev_ctx, Args... 
args) { - return impl_fn(static_cast(dev_ctx), std::forward(args)...); - } - - private: - template - struct CustomComputeCallHelper; - - /* DeviceContext Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext); - - /* Input Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor); - - /* Attribute Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - - /* Output Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor); - - // End: base template - template - struct CustomComputeCallHelper> { - template - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs, - DevCtx device_ctx, - Args... args) { - return impl_fn(device_ctx, args...); - } - }; -}; - -#define PD_PT_KERNEL(...) \ - ::paddle::CustomKernelFuncImpl::Compute - -#define PD_PT_VARIADIC_KERNEL(...) \ - reinterpret_cast( \ - &::paddle::CustomKernelFuncImpl::VariadicCompute) - -////////////////////// Op Kernel Info depended structs ////////////////////// -// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily. -// TensorArgDef follows phi::TensorArgDef in kernel_factory.h, the -// difference is that custom_kernel needs extra `is_vector' to ensure we can -// deal with case like vector with only one element. -struct TensorArgDef { - phi::Backend backend; - phi::DataLayout layout; - phi::DataType dtype; - bool is_vector{false}; - - TensorArgDef(phi::Backend in_backend, - phi::DataLayout in_layout, - phi::DataType in_dtype, - bool is_vector = false) - : backend(in_backend), - layout(in_layout), - dtype(in_dtype), - is_vector(is_vector) {} - - TensorArgDef& SetBackend(phi::Backend in_backend) { - backend = in_backend; - return *this; - } - - TensorArgDef& SetDataLayout(phi::DataLayout in_layout) { - layout = in_layout; - return *this; - } - - TensorArgDef& SetDataType(phi::DataType in_dtype) { - dtype = in_dtype; - return *this; - } -}; - -// AttributeArgDef follows phi::AttributeArgDef in kernel_factory.h -struct AttributeArgDef { - std::type_index type_index; - - explicit AttributeArgDef(std::type_index type_index) - : type_index(type_index) {} -}; - -////////////////////// Op Kernel Info ////////////////////// -// OpKernelInfo stores all info parsed from user kernel function, includes: -// 0. op_name and kernel key(backend, data_layout and data_type) -// 1. unified custom kernel function -// 2. variadic kernel function(use paddle::Tensor) -// 3. 
args info and user defined change for specific arg -class PADDLE_API OpKernelInfo { - public: - explicit OpKernelInfo(const std::string& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) - : op_name_(op_name), - backend_(backend), - layout_(data_layout), - dtype_(data_type) {} - - // format: PD_PT_KERNEL(...) - OpKernelInfo& SetKernelFn(CustomKernelFunc&& func); - // format: PD_PT_VARIADIC_KERNEL(...) - OpKernelInfo& SetVariadicKernelFn(void* func); - - // for Args parsing and storing - void AppendInput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendOutput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendAttribute(std::type_index type_index) { - attribute_defs_.emplace_back(AttributeArgDef(type_index)); - } - - // for Args user-def function - TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); } - TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); } - - const phi::Backend& GetBackend() const { return backend_; } - const phi::DataLayout& GetDataLayout() const { return layout_; } - const phi::DataType& GetDataType() const { return dtype_; } - - private: - friend class framework::OpKernelInfoHelper; - - // 1. op info - std::string op_name_; - - // 2. kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // 3. args info - paddle::SmallVector input_defs_{{}}; - paddle::SmallVector output_defs_{{}}; - paddle::SmallVector attribute_defs_{{}}; - - // 4. 
func info - CustomKernelFunc kernel_fn_{nullptr}; - void* variadic_kernel_fn_{nullptr}; -}; - -////////////////////// Op Kernel Args Parser ////////////////////// -// Define CustomKernelArgsParseFunctor for args parsing -// We have to store parsed info into OpKernelInfo before -// mapping to phi::KernelArgsDef in phi::Kernel -template -struct CustomKernelArgsParseFunctor; - -template -struct CustomKernelArgsParseFunctor { - using Args = std::tuple; - enum : std::size_t { Arity = sizeof...(Args_) }; - using Indices = std::make_index_sequence; - template - using Arg = typename std::tuple_element::type; - - static void Parse(OpKernelInfo* op_kernel_info) { - const phi::Backend& backend = op_kernel_info->GetBackend(); - const phi::DataLayout& layout = op_kernel_info->GetDataLayout(); - const phi::DataType& dtype = op_kernel_info->GetDataType(); - - auto default_tensor_layout = phi::DataLayout::NCHW; - if (layout != phi::DataLayout::ANY) { - default_tensor_layout = layout; - } - auto args_type = ParseArgType(Indices{}); - for (auto arg_type : args_type) { - if (arg_type == std::type_index(typeid(const CPUContext&))) { - // do nothing, skip context arg now - } else if (arg_type == std::type_index(typeid(const Tensor&))) { - op_kernel_info->AppendInput(backend, default_tensor_layout, dtype); - } else if (arg_type == - std::type_index(typeid(const std::vector&))) { - op_kernel_info->AppendInput( - backend, default_tensor_layout, dtype, true); - } else if (arg_type == std::type_index(typeid(Tensor*))) { - op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype); - } else if (arg_type == std::type_index(typeid(std::vector))) { - op_kernel_info->AppendOutput( - backend, default_tensor_layout, dtype, true); - } else { - op_kernel_info->AppendAttribute(arg_type); - } - } - } - - private: - template - static std::vector ParseArgType( - std::index_sequence) { - return {std::type_index(typeid(Arg))...}; - } -}; - -#define PD_PT_ARGS_PARSE(...) \ - ::paddle::CustomKernelArgsParseFunctor::Parse - -//////////////// Op Kernel Info Map ///////////////// -// all user custom kernels information are stored in this map -class PADDLE_API OpKernelInfoMap { - public: - static OpKernelInfoMap& Instance() { - static OpKernelInfoMap g_custom_kernel_info_map; - return g_custom_kernel_info_map; - } - - std::vector& operator[](const std::string& name); - - const std::unordered_map>& GetMap() - const; - - private: - OpKernelInfoMap() = default; - std::unordered_map> map_; - - PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap); -}; - -//////////////// Op Kernel Info Builder ///////////////// -// format: PD_PT_ARGS_PARSE(...) 
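CustomKernelArgsParseFunctor above enumerates the kernel function's parameter types at compile time and classifies each one by std::type_index at registration time: tensor references become inputs, tensor pointers become outputs, and anything else is treated as an attribute. A self-contained sketch of that classification technique, where Tensor and DummyKernel are illustrative stand-ins rather than the Paddle types:

#include <iostream>
#include <typeindex>
#include <typeinfo>
#include <vector>

struct Tensor {};  // illustrative stand-in

template <typename T>
struct ArgsParser;

template <typename Return, typename... Args>
struct ArgsParser<Return (*)(Args...)> {
  // Collect one std::type_index per kernel argument.
  static std::vector<std::type_index> ArgTypes() {
    return {std::type_index(typeid(Args))...};
  }
};

// A toy kernel signature: (input tensor, attribute, output tensor).
void DummyKernel(const Tensor&, float, Tensor*) {}

int main() {
  auto types = ArgsParser<decltype(&DummyKernel)>::ArgTypes();
  for (const auto& t : types) {
    if (t == std::type_index(typeid(const Tensor&))) {
      std::cout << "input\n";     // would call AppendInput(...)
    } else if (t == std::type_index(typeid(Tensor*))) {
      std::cout << "output\n";    // would call AppendOutput(...)
    } else {
      std::cout << "attribute\n"; // everything else becomes an attribute def
    }
  }
  return 0;
}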
-using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info); -using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel); - -class PADDLE_API OpKernelInfoBuilder { - public: - explicit OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type); - - OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func); - OpKernelInfoBuilder& SetVariadicKernelFn(void* func); - OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func); - OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func); - - private: - // op name - std::string op_name_; - - // kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // ref current info ptr - OpKernelInfo* info_ptr_; -}; -/////////////////////// Custom kernel register API ///////////////////////// -// For inference: compile directly with framework -// Call after PD_REGISTER_BUILTIN_KERNEL(...) -void RegisterAllCustomKernel(); - -//////////////// Custom kernel register macro ///////////////////// -// Refer to paddle/phi/core/kernel_registry.h, we can not use -// PD_REGISTER_KERNEL directly, common macros and functions are -// not ready for custom kernel now. -// Difference: custom_kernel stores all kernels' info into global -// g_custom_kernel_info_map before loading and registering into -// pten kernel management. Only providing PD_REGISTER_BUILTIN_KERNEL which -// supports 2 template arguments. - -#define PD_BACKEND(arg__) phi::Backend::arg__ -#define PD_DATALAYOUT(arg__) phi::DataLayout::arg__ -#define PD_DATATYPE(arg__) phi::DataType::arg__ - -#define PD_NARGS(...) _PD_NARGS((__VA_ARGS__, _PD_RESQ_N())) -#define _PD_NARGS(...) _PD_ARG_N(__VA_ARGS__) -#define _PD_ARG_N_EXPAND( \ - _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \ - N -#define _PD_ARG_N(args) _PD_ARG_N_EXPAND args -#define _PD_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) -#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) -#define PD_CONCATENATE2(arg1, arg2) arg1##arg2 - -#define PD_EXPAND(x) x - -#ifdef __COUNTER__ -#define PD_ID __COUNTER__ -#else -#define PD_ID __LINE__ -#endif - -#define PD_REGISTER_BUILTIN_KERNEL( \ - kernel_name, backend, layout, func, cpp_dtype, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PD_REGISTER_BUILTIN_KERNEL must be called in global namespace."); \ - _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, func, cpp_dtype, ##__VA_ARGS__) - -// WIN32 is not supported -#define _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__); \ - static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel); \ - PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, \ - backend, \ - layout, \ - &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__); \ - void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel) - -#define PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - _PD_KERNEL_INSTANTIATION(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, ##__VA_ARGS__)) - -#define PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PD_KERNEL_REGISTRAR_INIT(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PD_KERNEL_REGISTRAR_INIT(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PD_CONCATENATE(_PD_KERNEL_REGISTRAR_INIT_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format on - -#define _PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); - -#define _PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) -} // namespace paddle diff --git a/paddle/phi/api/lib/op_kernel_info.cc b/paddle/phi/api/lib/op_kernel_info.cc deleted file mode 100644 index c2aef8288dae1..0000000000000 --- a/paddle/phi/api/lib/op_kernel_info.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/fluid/framework/custom_kernel.h" - -namespace paddle { - -////////////////////// Op Kernel Info ////////////////////// - -OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) { - kernel_fn_ = std::forward(func); - return *this; -} - -OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) { - variadic_kernel_fn_ = func; - return *this; -} - -//////////////// Op Kernel Info Map ///////////////// - -std::vector& OpKernelInfoMap::operator[]( - const std::string& name) { - return map_[name]; -} - -const std::unordered_map>& -OpKernelInfoMap::GetMap() const { - return map_; -} - -//////////////// Op Kernel Info Builder ///////////////// - -OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) { - // 1. member assign - op_name_ = std::forward(op_name); - backend_ = backend; - layout_ = data_layout; - dtype_ = data_type; - - // 2. info parse - auto& info_vector = OpKernelInfoMap::Instance()[op_name_]; - auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_); - info_vector.emplace_back(std::move(op_kernel_info)); - - // 3. get current info ptr - info_ptr_ = &(info_vector.back()); -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) { - info_ptr_->SetKernelFn(std::forward(func)); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) { - info_ptr_->SetVariadicKernelFn(func); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse( - CustomKernelArgsParseFn func) { - func(this->info_ptr_); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) { - func(this->info_ptr_); - return *this; -} - -/////////////////////// Op register API ///////////////////////// - -// For inference: compile directly with framework -// Call after PD_REGISTER_BUILTIN_KERNEL(...) -void RegisterAllCustomKernel() { - auto& op_kernel_info_map = OpKernelInfoMap::Instance(); - framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map); -} - -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpKernelInfoMap. -paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() { - return paddle::OpKernelInfoMap::Instance(); -} - -#ifdef __cplusplus -} // end extern "C" -#endif diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 4603f4123acd0..6a1688947b986 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -129,10 +129,10 @@ struct KernelArgsParseFunctor { } }; -// NOTE: used for making a difference between kernels compiled with phi or not. +// NOTE: used for making a difference between inner or outer registration. 
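OpKernelInfoBuilder in op_kernel_info.cc above registers by side effect: its constructor appends a fresh OpKernelInfo to the global OpKernelInfoMap and keeps a pointer, so the chained SetKernelFn/ArgsParse/ArgsDef calls mutate the stored entry, and the registration macros simply declare one static builder object per kernel so all of this runs during static initialization. A condensed, self-contained sketch of that pattern with simplified types, not the actual Paddle classes:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct KernelInfo {
  std::string op_name;
  void* kernel_fn{nullptr};
};

class KernelInfoMap {
 public:
  static KernelInfoMap& Instance() {
    static KernelInfoMap g_map;  // process-wide singleton
    return g_map;
  }
  std::vector<KernelInfo>& operator[](const std::string& name) {
    return map_[name];
  }
  const std::unordered_map<std::string, std::vector<KernelInfo>>& GetMap() const {
    return map_;
  }

 private:
  KernelInfoMap() = default;
  std::unordered_map<std::string, std::vector<KernelInfo>> map_;
};

class KernelInfoBuilder {
 public:
  explicit KernelInfoBuilder(std::string op_name) {
    auto& infos = KernelInfoMap::Instance()[op_name];
    infos.push_back(KernelInfo{op_name, nullptr});
    info_ptr_ = &infos.back();  // later chained setters mutate the stored entry
  }
  KernelInfoBuilder& SetKernelFn(void* fn) {
    info_ptr_->kernel_fn = fn;
    return *this;
  }

 private:
  KernelInfo* info_ptr_;
};

// Roughly what a registration macro expands to: a static builder object,
// constructed (and therefore registered) before main() runs.
static KernelInfoBuilder g_dummy_registrar =
    KernelInfoBuilder("dummy_kernel").SetKernelFn(nullptr);

int main() {
  for (const auto& kv : KernelInfoMap::Instance().GetMap()) {
    std::cout << kv.first << ": " << kv.second.size() << " kernel(s)\n";
  }
  return 0;
}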
enum class RegType : uint8_t { - BUILTIN = 0, // compiled with phi - PLUGIN, // separate compiled and registered + INNER = 0, + OUTER, }; // TODO(chenweihang): Polish the kernel selection logic, support the selection @@ -205,7 +205,7 @@ struct KernelRegistrar { Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(kernel_key, &kernel); - if (reg_type == RegType::BUILTIN) { + if (reg_type == RegType::INNER) { KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; } else { CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; @@ -244,7 +244,7 @@ struct KernelRegistrar { * Note: `2TA` means `2 template argument` */ #define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ - _PD_REGISTER_KERNEL(::phi::RegType::BUILTIN, \ + _PD_REGISTER_KERNEL(::phi::RegType::INNER, \ kernel_name, \ backend, \ ::phi::backend##Context, \ @@ -918,7 +918,7 @@ struct KernelRegistrar { #define PD_REGISTER_GENERAL_KERNEL( \ kernel_name, backend, layout, kernel_fn, dtype) \ _PD_REGISTER_GENERAL_KERNEL( \ - ::phi::RegType::BUILTIN, kernel_name, backend, layout, kernel_fn, dtype) + ::phi::RegType::INNER, kernel_name, backend, layout, kernel_fn, dtype) #define _PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ @@ -992,7 +992,7 @@ struct KernelRegistrar { */ #define PD_REGISTER_BUILTIN_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, ...) \ - _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ kernel_name, \ backend, \ ::phi::backend##Context, \ @@ -1007,7 +1007,7 @@ struct KernelRegistrar { */ #define PD_REGISTER_PLUGIN_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, ...) \ - _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ kernel_name, \ backend, \ ::phi::CustomContext, \ From 69e9e9d5afd577f6786b43ac5bc18de9ff263cd0 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 23 Feb 2022 00:41:48 +0800 Subject: [PATCH 048/101] [PHI] Remove fill_any_like kernel register in fluid (#39807) * remove fill_any_like kernel in fluid and fix data transform bug * support scalar in infershpe * recover infershape in fill_and_like --- paddle/fluid/framework/operator.cc | 3 + paddle/fluid/imperative/prepared_operator.h | 3 + paddle/fluid/operators/fill_any_like_op.cc | 14 +--- paddle/fluid/operators/fill_any_like_op.cu | 29 -------- paddle/fluid/operators/fill_any_like_op.h | 74 ------------------- .../fluid/operators/fill_any_like_op_npu.cc | 2 +- .../fluid/operators/fill_any_like_op_xpu.cc | 2 +- paddle/phi/api/lib/CMakeLists.txt | 5 +- paddle/phi/common/scalar.h | 5 +- paddle/phi/kernels/cpu/full_kernel.cc | 4 +- paddle/phi/kernels/gpu/full_kernel.cu | 4 +- paddle/phi/kernels/xpu/full_kernel.cc | 4 +- 12 files changed, 26 insertions(+), 123 deletions(-) delete mode 100644 paddle/fluid/operators/fill_any_like_op.cu delete mode 100644 paddle/fluid/operators/fill_any_like_op.h diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8e614faa248fa..e589f059f522b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1972,6 +1972,9 @@ Scope* OperatorWithKernel::PreparePtenData( continue; } + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } auto expected_place = phi::TransToPtenPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; diff --git 
a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 714e429798662..879b3ec3e68a2 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -479,6 +479,9 @@ void PreparePtenData(const phi::Kernel& pt_kernel, auto var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); if (tensor_in && tensor_in->IsInitialized()) { + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } auto expected_place = phi::TransToPtenPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc index ed68bd7b7c2a5..e6de430a78c1a 100644 --- a/paddle/fluid/operators/fill_any_like_op.cc +++ b/paddle/fluid/operators/fill_any_like_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_any_like_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -91,14 +92,3 @@ REGISTER_OPERATOR( ::paddle::framework::EmptyGradOpMaker, ::paddle::framework::EmptyGradOpMaker, ops::FillAnyLikeVarTypeInference) - -REGISTER_OP_CPU_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.cu b/paddle/fluid/operators/fill_any_like_op.cu deleted file mode 100644 index 3ebc0ad7c8ec5..0000000000000 --- a/paddle/fluid/operators/fill_any_like_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/fill_any_like_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h deleted file mode 100644 index 36b56394b6f1b..0000000000000 --- a/paddle/fluid/operators/fill_any_like_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" - -#include "paddle/phi/kernels/full_kernel.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - // TODO(fangzeyang): Once context.Attribute supports double dtype, this - // kernel should be updated to support double dtype, too. - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - const auto& dev_ctx = context.template device_context(); - // call new kernel - phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), - *x, value, phi::DataType::UNDEFINED, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index a584c1341dc0f..2a914ff2ebd33 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index 693d4431b2ec8..896310cd0918b 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 720c6f54bb075..1ebddc3d3cd1b 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -91,8 +91,9 @@ cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_ cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) +cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) + cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) +cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform wrapped_infermeta) cc_library(pten_dygraph_api SRCS ${dygraph_api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 9a5a3fbf921d0..72cef89d300c8 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -25,7 +25,6 @@ namespace experimental { template class ScalarBase { public: - bool FromTensor() const { return is_from_tensor_; } // Constructor support implicit ScalarBase(double val) : dtype_(DataType::FLOAT64) { // NOLINT data_.f64 = val; @@ -157,6 +156,10 @@ class ScalarBase { CopyScalar(other, this); } + bool FromTensor() const { return is_from_tensor_; } + + void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } + template inline RT to() const { switch (dtype_) { diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 84d7f56d3361c..6b0183d31c6ec 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -99,4 +99,6 @@ PD_REGISTER_KERNEL(full_like, int, int64_t, bool, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index d5cb1575b7181..48b26540331ef 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -123,4 +123,6 @@ PD_REGISTER_KERNEL(full_like, int, int64_t, bool, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index b514425cc54da..574f4e991a260 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -139,4 +139,6 @@ PD_REGISTER_KERNEL(full_like, float, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} From 94243789828f38e5220ae4b7e97553701e148000 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 23 Feb 2022 09:36:57 +0800 Subject: [PATCH 049/101] Supported intermediate outputs for eager final state codegen (#39767) * Supported intermediate outputs for eager final state codegen * Added validation check for intermediate tensors --- 
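The generator change below parses an optional intermediate field from the op description, checks that those outputs sit at the tail of the forward return list, and then emits a call to the "_intermediate" API variant while dropping the trailing returns. A standalone sketch of the tail-position check with toy data; the names and structures here are illustrative, while the real generator operates on the parsed YAML entries:

# Minimal sketch of the validation described above.
def parse_intermediate(spec):
    # "xshape, out" -> ["xshape", "out"]
    return [name.strip() for name in spec.split(",")]


def intermediate_validation_check(intermediate_outputs, forward_returns_list):
    # forward_returns_list: [[ret_name, ret_type, position], ...]
    # Intermediate outputs must occupy the last len(intermediate_outputs) slots.
    tail_positions = range(
        len(forward_returns_list) - len(intermediate_outputs),
        len(forward_returns_list))
    for ret_name, _, pos in forward_returns_list:
        if ret_name in intermediate_outputs:
            assert pos in tail_positions, (
                "intermediate output '%s' is not at the tail" % ret_name)


if __name__ == "__main__":
    returns = [["out", "Tensor", 0], ["xshape", "Tensor", 1]]
    intermediate_validation_check(parse_intermediate("xshape"), returns)
    print("validation passed")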
.../final_state_generator/eager_gen.py | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index ca02a3d39779d..0578f930679b8 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -127,6 +127,26 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): + # intermediate_outputs : [name0, name1, ...] + # forward_returns_list : [[ret_name, type, orig_pos], ...] + """ + Check whether intermediate_outputs are positioned + at the very end of forward_returns_list + """ + + intermediate_positions = range( + len(forward_returns_list) - len(intermediate_outputs), + len(forward_returns_list)) + for ret_name, _, pos in forward_returns_list: + if ret_name in intermediate_outputs: + assert pos in intermediate_positions + + +def ParseIntermediate(string): + return [v.strip() for v in string.split(",")] + + def ParseNoNeedBuffer(string): # string: "x, y" no_need_buffer_set = set() @@ -742,11 +762,11 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, return node_creation_str -def GenerateForwardDefinition(fwd_api_name, bwd_api_name, - forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): +def GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, intermediate_outputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -790,13 +810,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_args_str = ", ".join(inputs_call_list) # Forward Full Logic - forward_call_str = f"auto api_result = paddle::experimental::{fwd_api_name}({inputs_call_args_str});" + if len(intermediate_outputs) == 0: + function_name = fwd_api_name + else: + function_name = fwd_api_name + "_intermediate" + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs - num_outputs = len(forward_outputs_position_map.keys()) + num_outputs = len(forward_outputs_position_map.keys()) - len( + intermediate_outputs) returns_type_list = ["" for i in range(num_outputs)] returns_list = ["" for i in range(num_outputs)] for name, (rtype, pos) in forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue if num_outputs == 1: returns_list[0] = f"api_result" else: @@ -1037,6 +1064,12 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): print("Prased Forward Attrs List: ", forward_attrs_list) print("Parsed Forward Returns List: ", forward_returns_list) + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) + + IntermediateValidationCheck(intermediate_outputs, forward_returns_list) + # Collect Original Forward Inputs/Outputs and then perform validation checks orig_forward_inputs_list, orig_forward_attrs_list, 
orig_forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -1095,7 +1128,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, intermediate_outputs) print("Generated Forward Definition: ", forward_definition_str) print("Generated Forward Declaration: ", forward_declaration_str) forward_definition_str += definition_declaration_pair[0] From 1fcaab456976ee5e72d056f72ed65884ce77d784 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 23 Feb 2022 10:01:14 +0800 Subject: [PATCH 050/101] Update record interface using part3 (#39695) * fix RecordEvent interface * modify default level to 4 * update interface use * add const default trace level * update record event interface using * update record event interface using * update record event interface using * update operator.cc * update part2 * update part1 * update part3 * fix include profiler.h header in ps server * fix include profiler.h header in ps server * fix profiler.h header * fix profiler.h header * fix merge buf * update * fix bug * fix bug --- .../grad_merge_all_reduce_op_handle.cc | 5 + paddle/fluid/memory/memcpy.cc | 119 +++++++++++++----- .../fluid/operators/controlflow/fetch_op.cc | 2 +- paddle/fluid/operators/conv_cudnn_op.cu | 2 +- paddle/fluid/operators/load_op.h | 2 +- .../operators/pscore/send_and_recv_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 13 +- paddle/fluid/operators/reader/read_op.cc | 5 +- paddle/fluid/platform/device_context.cc | 4 +- 9 files changed, 111 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index a623266719343..44b9ca90fc540 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
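The call sites updated below use the newer RecordEvent signature from paddle/fluid/platform/profiler/event_tracing.h: an event name, a TracerEventType (Communication for the all-reduce handles, UserDefined for the memory copies), and a numeric trace level, with the event scoped to the lifetime of the local object. A simplified, self-contained RAII sketch of that pattern; the level-gating behaviour and the default level of 4 are assumptions taken from the commit message, not the real implementation:

#include <chrono>
#include <iostream>
#include <string>

// Simplified stand-ins for the profiler types used in the diff below.
enum class TracerEventType { UserDefined, Communication };

class RecordEventSketch {
 public:
  RecordEventSketch(std::string name, TracerEventType /*type*/, int level,
                    int active_level = 4 /* assumed default trace level */)
      : name_(std::move(name)),
        enabled_(level <= active_level),  // assumption: low levels are kept
        start_(std::chrono::steady_clock::now()) {}

  ~RecordEventSketch() {
    if (!enabled_) return;
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  bool enabled_;
  std::chrono::steady_clock::time_point start_;
};

void CopyHostToDevice() {
  // Usage mirrors the call sites in the diff: name, event type, trace level.
  RecordEventSketch record_event("GpuMemcpyAsync:CPU->GPU",
                                 TracerEventType::UserDefined, 1);
  // ... the work being profiled ...
}

int main() {
  CopyHostToDevice();
  return 0;
}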
#include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DECLARE_bool(sync_nccl_allreduce); @@ -47,6 +48,8 @@ GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( #endif void GradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", @@ -96,6 +99,8 @@ FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( #endif void FusedGradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index a71e5fe9877c5..166cdd0b5d6b6 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -246,7 +246,8 @@ void Copy(platform::NPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -256,7 +257,8 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -275,14 +277,16 @@ void Copy(platform::CPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -300,7 +304,9 @@ void Copy(platform::NPUPlace dst_place, if (dst_place == src_place) { platform::SetNPUDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -308,7 +314,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); 
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } else { @@ -318,7 +326,9 @@ void Copy(platform::NPUPlace dst_place, } if (stream) { // TODO(zhiqiu): support peer access? - platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -326,7 +336,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } @@ -374,14 +386,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -398,7 +414,9 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -408,7 +426,9 @@ void Copy( platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -596,7 +616,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -605,7 +626,8 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -628,7 +650,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:CPU->GPU", 
platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -637,7 +660,8 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #else @@ -661,7 +685,9 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); @@ -670,7 +696,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); #else @@ -679,11 +707,15 @@ void Copy( } } else { if (stream) { - platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { - platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -729,7 +761,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -738,7 +772,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -758,7 +794,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -767,7 +805,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, 
hipMemcpyHostToDevice); #else @@ -927,7 +967,9 @@ void Copy(platform::CPUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); + platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2HAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -936,7 +978,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); + platform::RecordEvent record_event( + "MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyD2HSync(dst, src, num); } } @@ -953,7 +996,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); + platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyH2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -962,7 +1007,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); + platform::RecordEvent record_event( + "MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyH2DSync(dst, src, num); } } @@ -980,8 +1026,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event( - "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -991,20 +1038,26 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DSync(dst, src, num); } } else { if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc 
index ed4995d4fbeda..de3d8bd996149 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index bda5ac42da8e3..dff60afd74c02 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(cudnn_deterministic); DECLARE_uint64(conv_workspace_size_limit); diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 521a35646c45a..7a161fb9dd383 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 980351e12a030..c5971632b03ef 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 6393ff2135d1d..21c23a7f602a3 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -115,7 +116,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::CUDAPinnedPlace cuda_pinned_place; std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); // NODE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. 
@@ -170,7 +173,9 @@ void BufferedReader::ReadAsync(size_t i) { cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); @@ -229,7 +234,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 73bc67287c278..d406640bff240 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -106,7 +106,8 @@ class ReadOp : public framework::OperatorBase { std::vector ins; // For profiling - platform::RecordEvent record_event(Type()); + platform::RecordEvent record_event( + Type().c_str(), platform::TracerEventType::UserDefined, 1); reader->ReadNext(&ins); if (ins.empty()) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index e5e369efd6bb4..4282ec20623c9 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -322,7 +323,8 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - platform::RecordEvent record_event("NPUDeviceContext/wait"); + platform::RecordEvent record_event("NPUDeviceContext/wait", + platform::TracerEventType::UserDefined, 2); VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); } From ca4df333d3ea3be71d82273865ee39e7c5c74910 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 23 Feb 2022 10:02:09 +0800 Subject: [PATCH 051/101] [bf16] add bf16 kernel: elementwise_div (#39602) * add elementwise_div * refine rocm * refine code * refine op register * solve conflict * refine unittest * refine unittest precision * add rocm --- .../elementwise/elementwise_div_op.cu | 6 +++ .../device/gpu/cuda/cuda_device_function.h | 12 ++++++ .../device/gpu/rocm/rocm_device_function.h | 7 ++++ paddle/phi/kernels/gpu/math_kernel.cu | 2 + .../unittests/test_elementwise_div_op.py | 38 ++++++++++++++++++- 5 files changed, 64 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 06f9107db27b4..9eb4b0352e533 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -53,6 +53,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, @@ -65,6 +67,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, @@ -78,6 +82,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 63897bd671740..61bf1905fdb74 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -91,6 +91,13 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor(static_cast(val), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { + return bfloat16(__shfl_xor(static_cast(val), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/phi/kernels/gpu/math_kernel.cu 
b/paddle/phi/kernels/gpu/math_kernel.cu index f7b1205cb593a..02e3f00bd3425 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -92,6 +92,7 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide) } // namespace phi using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; @@ -128,6 +129,7 @@ PD_REGISTER_KERNEL(divide_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(multiply_raw, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 32860a6694a89..a43e56b0815a6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseDivOp(OpTest): @@ -55,6 +55,42 @@ def init_dtype(self): pass +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestElementwiseDivOpBF16(OpTest): + def setUp(self): + self.op_type = "elementwise_div" + self.dtype = np.uint16 + + x = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32) + y = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32) + + out = np.divide(x, y) + + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseDivOp_scalar(ElementwiseDivOp): From abe232d8288621d773f2bd886c56ffe1748ac7af Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 23 Feb 2022 10:07:13 +0800 Subject: [PATCH 052/101] [Eager] Support Eager mode for some model testcase (#39248) * eager, test=develop * fix bug, test=develop * eager, test=develop * merge legacy to fluid * eager, test=develop * eager, test=develop * Refactor TensorAdd func by template and remove gradient_accumulation in eager * Remove needless target name * eager, test=develop * eager, test=develop * Use overload instead of template * Remove legacy code * Remove legacy code * selectedrows, test=develop * Remove DataType test * eager, test=develop * eager, test=develop * support gan, test=develop * Using Tensor directly instead of using EagerTensor * support gradient_accumulation * make test_imperative_lod_tensor_to_selected_rows longer * make test_imperative_lod_tensor_to_selected_rows longer * refine code * ptb, test=develop * Rename all EagerTensor to Tensor * Rename some EagerTensor to Tensor * rename EagerTensor to EagerVariable * eager, test=develop * eager, test=develop * eager, test=develop * eager, test=develop * add more test * eager, 
test=develop * Support copiable selected rows and merge develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * clear grad, test=develop * merge, develop * merge, develop Co-authored-by: JiabinYang <360788950@qq.com> Co-authored-by: Weilong Wu --- .../eager/accumulation/accumulation_node.cc | 1 + .../eager/accumulation/accumulation_node.h | 2 + paddle/fluid/eager/api/utils/hook_utils.cc | 8 +- .../auto_code_generator/eager_generator.cc | 215 +++++++++++++----- paddle/fluid/eager/autograd_meta.h | 6 + paddle/fluid/eager/grad_node_info.cc | 5 +- paddle/fluid/eager/grad_node_info.h | 2 + .../tests/task_tests/eager_utils_test.cc | 2 +- paddle/fluid/eager/utils.cc | 75 +++--- paddle/fluid/eager/utils.h | 29 ++- paddle/fluid/pybind/eager.cc | 55 ++--- paddle/fluid/pybind/eager_method.cc | 38 +++- python/paddle/fluid/dygraph/nn.py | 3 +- .../fluid/dygraph/varbase_patch_methods.py | 8 - python/paddle/fluid/initializer.py | 10 +- python/paddle/fluid/layers/nn.py | 24 +- .../tests/unittests/test_imperative_basic.py | 2 +- .../unittests/test_imperative_framework.py | 15 +- .../tests/unittests/test_imperative_gan.py | 8 +- .../tests/unittests/test_imperative_mnist.py | 10 +- .../test_imperative_mnist_sorted_gradient.py | 8 +- .../unittests/test_imperative_ptb_rnn.py | 10 +- .../tests/unittests/test_imperative_resnet.py | 32 +-- .../test_imperative_resnet_sorted_gradient.py | 8 +- 24 files changed, 384 insertions(+), 192 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 8b6752dfec743..9b0e784c0efb1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -47,6 +47,7 @@ void GradNodeAccumulation::RetainGrad( std::vector> GradNodeAccumulation:: operator()( const std::vector>& grads) { + VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( "GradNodeAccumulation should take exactly 1 grad tensor" diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index be2ccc263e806..3f53517204a5a 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,6 +35,8 @@ class GradNodeAccumulation : public GradNodeBase { paddle::experimental::Tensor* Grad() { return &accumulated_grad; } + std::string name() { return "GradNodeAccumulation"; } + /** * Register ReduceHook * **/ diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index ee6a3afc6ffd3..7d2997eb884c8 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -70,12 +70,8 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { grad_tensor->set_impl(t.impl()); return *grad_tensor.get(); } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Detected uninitialized variable, causing segmentation " - "fault " - "inside the hook." - "Tensor has to be initialized while we need to set it." 
- "please check tensor initialization status.")); + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 63f25f5528100..7cddfd9c1c7dc 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -554,6 +554,21 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } +static bool BeSameAsInput(const std::string& output_name, + const std::set& input_names) { + if (output_name.size() < 4) { + return false; + } + + if (output_name.substr(output_name.size() - 3, 3) == "Out") { + if (input_names.count(output_name.substr(0, output_name.size() - 3))) { + return true; + } + } + + return false; +} + /* --------------------------------------- */ /* --------- Preprocess Ins/Outs --------- */ /* --------------------------------------- */ @@ -1022,27 +1037,12 @@ static std::string GenerateGradNodeCreationContent( "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } } VLOG(6) << "Generated outputs autograd_meta"; @@ -1180,11 +1180,13 @@ static std::string GenerateGradNodeCreationContent( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + } } VLOG(6) << "Generated SetGradIn/OutMeta"; @@ -1324,19 +1326,21 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += "\n"; // Handle Dispensable Inputs + std::set input_names; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); + input_names.insert(input_name); if (input.dispensable()) { if (input.duplicable()) { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.size() > 0) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += 
paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } @@ -1372,11 +1376,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type].push_back("tensor"); } - const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - outs_contents_str += paddle::string::Sprintf( - FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + if (BeSameAsInput(output_name, input_names)) { + if (!output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, input_name); + } + } else { + const char* FWD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + } core_ops_args_info[op_type].push_back(output_var_name); } else { @@ -1415,6 +1429,23 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += outs_map_str; generated_function_body += "\n"; + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + if (op_passing_outs_map[op_type].count(output_name)) { + if (BeSameAsInput(output_name, input_names)) { + if (output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = + " if (ins.count(\"%s\")) outs[\"%s\"] = ins[\"%s\"];\n"; + generated_function_body += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, input_name, output_name, input_name); + } + } + } + } + generated_function_body += "\n"; + VLOG(6) << "Generated Outs Map"; // [Generation] Get Attrs @@ -1448,33 +1479,61 @@ static std::pair GenerateForwardFunctionContents( std::string output_varname = LegalizeVariableName(output_name); if (output.duplicable()) { - const char* FWD_OUT_TENSORS_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, - output_varname, output_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, + output_name, output_var_args_name, output_var_args_name, + output_varname); + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, + output_var_args_name, output_var_args_name, output_varname); + } + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname, + 
output_name, output_varname); + } return_types[return_position] = "std::vector"; - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s = " - "egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, - output_varname, output_name); - - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"][0], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSOR_TEMPLATE = + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_name, + output_var_args_name, output_varname, output_var_args_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name, + output_varname, output_var_args_name); + } + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); } return_types[return_position] = "paddle::experimental::Tensor"; } @@ -1494,6 +1553,7 @@ static std::pair GenerateForwardFunctionContents( GenerateGradNodeCreationContent(fwd_info, bwd_info); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; + // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; } @@ -1588,12 +1648,25 @@ static std::string GenerateSingleOpBase( const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); // [Generation] Get Ins Map + std::unordered_set dispensable_input_name_set; + for (const auto& in : in_vars) { + if (in.dispensable()) dispensable_input_name_set.insert(in.name()); + } + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + } std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_fwd_slotname_map.count(grad_input_name)) { // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + continue; + } std::string struct_fwd_input_name = grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; const char* GRAD_INS_FWD_CONTENT_TEMPLATE = @@ -1634,14 +1707,41 @@ static std::string GenerateSingleOpBase( paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); generated_grad_function_body += ins_map_str; - VLOG(6) << "Generated Ins Map"; + for (auto iter : grad_ins) { + const std::string& grad_input_name = 
iter.first; - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + if (duplicable_input_name_set.count(fwd_name)) { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " if(this->%s.size() > 0) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" + "RecoverTensorWrapper(&this->%s, nullptr));\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, + ins_name, grad_input_name, struct_fwd_input_name); + } else { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, " + "nullptr);\n if(%s.initialized()) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(%s);\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, + struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, + grad_input_name); + } + } + } } + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -1987,6 +2087,7 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " // SetAttrMap\n" "%s\n" + " std::string name() { return \"GradNode%s\"; }\n" "\n" " private:\n" " // TensorWrappers\n" @@ -2085,8 +2186,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, - attr_members_str); + set_tensor_wrappers_str, set_attr_map_str, op_type, + tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index f4b2b8e08d4fa..53f17a4ffe58c 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -127,6 +127,12 @@ class AutogradMeta : public AbstractAutogradMeta { stop_gradient_ = static_cast(stop_gradient); } + void WeakSetStopGradient(bool stop_gradient) { + if (stop_gradient_ == -1) { + stop_gradient_ = static_cast(stop_gradient); + } + } + bool Persistable() const { return persistable_; } void SetPersistable(bool persistable) { persistable_ = persistable; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 598b368c6426a..d83fa916db66c 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -69,13 +69,16 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "adj_edges is designed to has the same size of grad " "inputs's slot num.")); if (meta && !meta->StopGradient()) { - VLOG(6) << "Add Edges for slot: " << slot_id; auto node = meta->GetMutableGradNode(); if (node) { + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared()); + VLOG(6) << 
"Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 8603d84fe8df5..f699f9ab28e2d 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -147,6 +147,8 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + virtual std::string name() { return "GradNodeBase"; } + private: // TODO(jiabin): Use SmallVector instead after merge PR from develop diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 990f700056158..217055e4e9e4a 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -159,7 +159,7 @@ TEST(EagerUtils, PassStopGradient) { CHECK(auto_grad0->StopGradient() == false); egr::EagerUtils::PassStopGradient(true, auto_grad0.get(), auto_grad1.get(), auto_grad2.get(), auto_grad3.get()); - CHECK(auto_grad0->StopGradient() == true); + CHECK(auto_grad0->StopGradient() == false); CHECK(auto_grad1->StopGradient() == true); CHECK(auto_grad2->StopGradient() == true); CHECK(auto_grad3->StopGradient() == true); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 9c6c7d4d540c6..5d8dff5cd5b24 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -220,53 +220,62 @@ paddle::experimental::Tensor EagerUtils::GetOutput( return paddle::experimental::Tensor(out->GetTensorBase(), out->name()); } -void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor) { +void EagerUtils::GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var) { PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. " - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - tensor->set_impl(out->GetTensorBase()); + out_var, paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. " + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var->set_impl(out->GetTensorBase()); } -void EagerUtils::OverwriteOutputs( +void EagerUtils::GetOutputs( const std::vector>& outs, - const std::vector& tensors) { - PADDLE_ENFORCE_EQ( - outs.size(), tensors.size(), - paddle::platform::errors::Fatal( - "We are tring to OverwriteOutputs which passed in and it expected " - "elements num of outs and origin outputs are equal, but we got outs " - "size of: %d, and tensors passed in size is: %d", - outs.size(), tensors.size())); + std::vector* result) { for (size_t i = 0; i < outs.size(); i++) { - OverwriteOutputs(outs[i], tensors[i]); + result->emplace_back(outs[i]->GetTensorBase()); } } -void EagerUtils::OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. 
" - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - *tensor = out; -} -void EagerUtils::OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors) { +void EagerUtils::GetOutputs( + const std::vector>& outs, + const std::vector& out_var) { for (size_t i = 0; i < outs.size(); i++) { PADDLE_ENFORCE_NOT_NULL( - tensors[i], paddle::platform::errors::Fatal( + out_var[i], paddle::platform::errors::Fatal( "Tensor is null and cannot be copied. " "We are tring to OverwriteOutput from its " "shared_ptr, this error may indicate some outputs " "are nullptr")); - *tensors[i] = outs[i]; + out_var[i]->set_impl(outs[i]->GetTensorBase()); + } +} + +void EagerUtils::GetOutputs(const std::shared_ptr& out, + std::vector* result) { + result->emplace_back(out->GetTensorBase()); +} + +void EagerUtils::GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var) { + PADDLE_ENFORCE_NOT_NULL( + out_var[0], paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. " + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var[0]->set_impl(out->GetTensorBase()); +} + +void EagerUtils::Output2Result( + const std::vector& out_var, + std::vector* result) { + result->reserve(out_var.size()); + for (size_t i = 0; i < out_var.size(); i++) { + result->emplace_back(*out_var[i]); } } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 00013faa345e2..b74d68db2a6d5 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -77,7 +77,7 @@ class PassStopGradientIter : public IterHelper { VLOG(2) << "Tensor is NULL"; return; } - element->SetStopGradient(stop_gradient_); + element->WeakSetStopGradient(stop_gradient_); } bool stop_gradient_ = true; @@ -173,17 +173,24 @@ class EagerUtils { const std::vector>& outs); static paddle::experimental::Tensor GetOutput( const std::shared_ptr& out); - // Sync Back to origin output Tensor - static void OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs( + static void GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var); + static void GetOutputs( const std::vector>& outs, - const std::vector& tensors); - static void OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors); + std::vector* result); + static void GetOutputs( + const std::vector>& outs, + const std::vector& out_var); + static void GetOutputs(const std::shared_ptr& out, + std::vector* result); + static void GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var); + + static void Output2Result( + const std::vector& out_var, + std::vector* result); + // end Intermidate needed static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 241e9f9058dfe..3867336764834 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -50,7 +50,6 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { if (obj) { auto v = reinterpret_cast(obj); new (&(v->tensor)) paddle::experimental::Tensor(); - Py_INCREF(obj); } return obj; } @@ -58,7 +57,7 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { // TODO(jiabin): Overload this once we need 
more constructor in Python void EmptyTensorInitializer(TensorObject* self, const std::string& name, const paddle::platform::Place& place, - bool persistable = false, bool stop_gradient = true, + bool persistable = false, int stop_gradient = -1, framework::proto::VarType::Type dtype = paddle::framework::proto::VarType::FP32, const std::vector& dims = {}, @@ -74,7 +73,9 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); - autograd_meta->SetStopGradient(stop_gradient); + if (stop_gradient != -1) { + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + } if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { // TODO(jiabin): Maybe support LOD later std::shared_ptr dense_tensor = @@ -143,13 +144,12 @@ void InitTensorWithTensor(TensorObject* self, src.copy_to(phi::TransToPtenBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); if (src.get_autograd_meta()) { - egr::EagerUtils::unsafe_autograd_meta(self->tensor) + egr::EagerUtils::autograd_meta(&(self->tensor)) ->SetPersistable( egr::EagerUtils::unsafe_autograd_meta(src)->Persistable()); } else { - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } } @@ -168,8 +168,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self, temp.copy_to(phi::TransToPtenBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } py::object ParsePyArray( @@ -218,21 +217,18 @@ paddle::platform::Place ParsePlace( } // boolean arguments: zero_copy, stop_gradient, persistable -bool ParseBooleanArgs(std::string key, - std::unordered_map kws_map, - std::unordered_map kw_order_map, - PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { - bool res = false; - if (key == "stop_gradient") res = true; +int ParseBooleanArgs(std::string key, + std::unordered_map kws_map, + std::unordered_map kw_order_map, + PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + int res = -1; if (kw_order_map[key] <= args_num) { - res = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, kw_order_map[key] - 1), - kw_order_map[key] - 1); + res = static_cast(CastPyArg2AttrBoolean( + PyTuple_GET_ITEM(args, kw_order_map[key] - 1), kw_order_map[key] - 1)); } else { if (flag_kwargs && kws_map[key] != NULL) { - res = CastPyArg2AttrBoolean(kws_map[key], 0); - } else { - return res; + res = static_cast(CastPyArg2AttrBoolean(kws_map[key], 0)); } } return res; @@ -288,15 +284,15 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, bool persistable = false; bool zero_copy = false; std::string act_name = ""; - bool stop_gradient = true; + int stop_gradient = -1; numpy_value = ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num); - persistable = ParseBooleanArgs("persistable", kws_map, kw_order_map, args, - flag_kwargs, args_num); - zero_copy = ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, - flag_kwargs, args_num); + persistable = (1 == ParseBooleanArgs("persistable", kws_map, 
kw_order_map, + args, flag_kwargs, args_num)); + zero_copy = (1 == ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, + flag_kwargs, args_num)); act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num); stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num); @@ -571,7 +567,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), persistable, - /* stop_gradient */ true, dtype, dims, var_type); + /* stop_gradient */ -1, dtype, dims, var_type); return 0; } else { @@ -655,7 +651,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), - persistable, true, dtype, dims, var_type); + persistable, -1, dtype, dims, var_type); return 0; } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's initializer."; @@ -726,9 +722,8 @@ PyMappingMethods mapping_methods; void BindEager(pybind11::module* module) { auto m = module->def_submodule("eager"); - auto& internals = pybind11::detail::get_internals(); auto heap_type = reinterpret_cast( - internals.default_metaclass->tp_alloc(internals.default_metaclass, 0)); + PyType_Type.tp_alloc(&PyType_Type, 0)); heap_type->ht_name = ToPyObject("Tensor"); heap_type->ht_qualname = ToPyObject("Tensor"); auto type = &heap_type->ht_type; @@ -742,8 +737,8 @@ void BindEager(pybind11::module* module) { type->tp_getset = variable_properties; type->tp_init = TensorInit; type->tp_new = TensorNew; - Py_INCREF(internals.instance_base); - type->tp_base = reinterpret_cast(internals.instance_base); + Py_INCREF(&PyBaseObject_Type); + type->tp_base = reinterpret_cast(&PyBaseObject_Type); type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; #if PY_VERSION_HEX >= 0x03050000 diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index b1aef8fc08fea..27328bea692af 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -186,11 +186,17 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* tensor__clear_gradient(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "ClearGradient " << self->tensor.name(); + Py_ssize_t args_num = PyTuple_Size(args); + bool set_to_zero = true; + if (args_num == (Py_ssize_t)1) { + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); + } + paddle::experimental::Tensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { // Add RetainGrad as PostHook to AccumulationNode @@ -209,13 +215,27 @@ static PyObject* tensor__clear_gradient(TensorObject* self, PyObject* args, grad = meta->MutableGrad(); } - if (grad->initialized()) { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } 
else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } + } } + Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL @@ -407,7 +427,7 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"retain_grads", (PyCFunction)(void (*)(void))tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_clear_gradient", (PyCFunction)(void (*)(void))tensor__clear_gradient, + {"clear_gradient", (PyCFunction)(void (*)(void))tensor_clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index e1857a34f03f5..5bb1aef6d6e9b 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -240,7 +240,8 @@ def _get_default_param_initializer(): is_bias=True) def forward(self, input): - if in_dygraph_mode() and self._l_type == 'conv2d': + if in_dygraph_mode() and (self._l_type == 'conv2d' or + self._l_type == 'depthwise_conv2d'): attrs = ('strides', self._stride, 'paddings', self._padding, 'dilations', self._dilation, 'groups', self._groups if self._groups else 1, 'use_cudnn', self._use_cudnn, diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 6f0305f4774d6..c4ea751ed92f8 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -779,13 +779,6 @@ def _set_grad_ivar(self, value): raise TypeError( "_set_grad_ivar is only supported for Parameter Tensor") - @framework.dygraph_only - def clear_gradient(self, set_to_zero=True): - if set_to_zero: - self._zero_grads() - else: - self._clear_gradient() - @framework.dygraph_only def clone(self): return _C_ops_.assign(self) @@ -815,7 +808,6 @@ def value(self): if core._in_eager_mode(): setattr(core.eager.Tensor, "_grad_ivar", _grad_ivar) setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) - setattr(core.eager.Tensor, "clear_gradient", clear_gradient) setattr(core.eager.Tensor, "clone", clone) setattr(core.eager.Tensor, "value", value) else: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index ea17d029b6cc2..9e3add6217958 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -138,11 +138,11 @@ def __call__(self, var, block=None): assert isinstance(block, framework.Block) if framework.in_dygraph_mode(): - var = _C_ops.fill_constant( - var, 'value', - float(self._value), 'force_cpu', self._force_cpu, 'dtype', - int(var.dtype), 'str_value', - str(float(self._value)), 'shape', var.shape) + _C_ops.fill_constant(var, 'value', + float(self._value), 'force_cpu', + self._force_cpu, 'dtype', + int(var.dtype), 'str_value', + str(float(self._value)), 'shape', var.shape) return None else: # fill constant should set the "str_value" to preserve precision diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d2653b75eafba..f022e1791daef 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ import paddle from ..layer_helper import 
LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags +from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags, _in_eager_mode from .. import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -6254,6 +6254,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): # the shape of reshaped_3 is [6,8]. """ if in_dygraph_mode(): + if _in_eager_mode(): + tmp_tensor_type = core.eager.Tensor + else: + tmp_tensor_type = Variable #TODO(zhiqiu): enable inplace in dygraph mode. if inplace: warnings.warn( @@ -6265,7 +6269,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): for item in shape ] out, _ = _C_ops.reshape2(x, None, 'shape', shape) - elif isinstance(shape, Variable): + elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True out, _ = _C_ops.reshape2(x, shape) else: @@ -11132,24 +11136,30 @@ def slice(input, axes, starts, ends): infer_flags = list(1 for i in range(len(axes))) + if _in_eager_mode(): + tmp_tensor_type = core.eager.Tensor + else: + tmp_tensor_type = Variable + if isinstance(starts, (list, tuple)): starts = [ - item.numpy().item(0) if isinstance(item, Variable) else item + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item for item in starts ] attrs += ('starts', starts) - elif isinstance(starts, Variable): + elif isinstance(starts, tmp_tensor_type): starts_tensor = starts starts.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) if isinstance(ends, (list, tuple)): ends = [ - item.numpy().item(0) if isinstance(item, Variable) else item - for item in ends + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item for item in ends ] attrs += ('ends', ends) - elif isinstance(ends, Variable): + elif isinstance(ends, tmp_tensor_type): ends_tensor = ends ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index a36b10f58ffaa..4734e67140a8d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -953,7 +953,7 @@ def func_metaclass(self): self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type') if core._in_eager_mode(): self.assertEqual( - type(paddle.fluid.core.eager.Tensor).__name__, 'pybind11_type') + type(paddle.fluid.core.eager.Tensor).__name__, 'type') else: self.assertEqual( type(paddle.fluid.core.VarBase).__name__, 'pybind11_type') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_framework.py b/python/paddle/fluid/tests/unittests/test_imperative_framework.py index 68628918391cb..2d900d65976e7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_framework.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_framework.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import numpy as np from test_imperative_base import new_program_scope +from paddle.fluid.framework import _test_eager_guard class MLP(fluid.Layer): @@ -46,7 +47,7 @@ def forward(self, inputs): class TestDygraphFramework(unittest.TestCase): 
- def test_dygraph_backward(self): + def func_test_dygraph_backward(self): with new_program_scope(): mlp = MLP(input_size=2) var_inp = fluid.layers.data( @@ -59,8 +60,18 @@ def test_dygraph_backward(self): except AssertionError as e: self.assertTrue((e is not None)) - def test_dygraph_to_string(self): + def test_dygraph_backward(self): + with _test_eager_guard(): + self.func_test_dygraph_backward() + self.func_test_dygraph_backward() + + def func_test_dygraph_to_string(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.dygraph.guard(): var_inp = fluid.dygraph.to_variable(np_inp) print(str(var_inp)) + + def test_dygraph_to_string(self): + with _test_eager_guard(): + self.func_test_dygraph_to_string() + self.func_test_dygraph_to_string() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 189745e7295a8..39b7f941c4bba 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -25,6 +25,7 @@ from paddle.fluid import Conv2D, Pool2D, Linear from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.framework import _test_eager_guard class Discriminator(fluid.Layer): @@ -54,7 +55,7 @@ def forward(self, inputs): class TestDygraphGAN(unittest.TestCase): - def test_gan_float32(self): + def func_test_gan_float32(self): seed = 90 paddle.seed(1) paddle.framework.random._manual_program_seed(1) @@ -227,6 +228,11 @@ def test_gan_float32(self): for k, v in six.iteritems(dy_params2): self.assertTrue(np.allclose(v, static_params[k])) + def test_gan_float32(self): + with _test_eager_guard(): + self.func_test_gan_float32() + self.func_test_gan_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 1e509960c0763..e221453200895 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from utils import DyGraphProgramDescTracerTestHelper, is_equal_program +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleImgConvPool(fluid.dygraph.Layer): @@ -114,7 +115,7 @@ def _reader_imple(): return _reader_imple - def test_mnist_float32(self): + def func_test_mnist_float32(self): seed = 90 epoch_num = 1 batch_size = 128 @@ -152,7 +153,7 @@ def test_mnist_float32(self): label = data[1] label.stop_gradient = True - if batch_id % 10 == 0: + if batch_id % 10 == 0 and not _in_eager_mode(): cost, traced_layer = paddle.jit.TracedLayer.trace( mnist, inputs=img) if program is not None: @@ -257,6 +258,11 @@ def test_mnist_float32(self): for key, value in six.iteritems(static_param_value): self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) + def test_mnist_float32(self): + with _test_eager_guard(): + self.func_test_mnist_float32() + self.func_test_mnist_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index bda1958c0f354..8e3cbaf9488bd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py 
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -26,10 +26,11 @@ from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from test_imperative_mnist import MNIST +from paddle.fluid.framework import _test_eager_guard class TestImperativeMnistSortGradient(unittest.TestCase): - def test_mnist_sort_gradient_float32(self): + def func_test_mnist_sort_gradient_float32(self): seed = 90 epoch_num = 1 @@ -144,6 +145,11 @@ def test_mnist_sort_gradient_float32(self): for key, value in six.iteritems(static_param_value): self.assertTrue(np.allclose(value, dy_param_value2[key], atol=1e-5)) + def test_mnist_sort_gradient_float32(self): + with _test_eager_guard(): + self.func_test_mnist_sort_gradient_float32() + self.func_test_mnist_sort_gradient_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 1c183a8c2b74a..7f2ce131a05d7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -27,6 +27,7 @@ import numpy as np import six from utils import DyGraphProgramDescTracerTestHelper, is_equal_program +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleLSTMRNN(fluid.Layer): @@ -210,10 +211,15 @@ def forward(self, input, label, init_hidden, init_cell): class TestDygraphPtbRnn(unittest.TestCase): - def test_ptb_rnn(self): + def func_test_ptb_rnn(self): for is_sparse in [True, False]: self.ptb_rnn_cpu_float32(is_sparse) + def test_ptb_rnn(self): + with _test_eager_guard(): + self.func_test_ptb_rnn() + self.func_test_ptb_rnn() + def ptb_rnn_cpu_float32(self, is_sparse): seed = 90 hidden_size = 10 @@ -260,7 +266,7 @@ def ptb_rnn_cpu_float32(self, is_sparse): y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) - if i % 5 == 0: + if i % 5 == 0 and (not _in_eager_mode()): outs, traced_layer = TracedLayer.trace( ptb_model, [x, y, init_hidden, init_cell]) outs_static = traced_layer([x, y, init_hidden, init_cell]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 2d67af82de87a..32e4aacf880e9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -26,6 +26,7 @@ from test_imperative_base import new_program_scope from utils import DyGraphProgramDescTracerTestHelper, is_equal_program from paddle.fluid.dygraph import TracedLayer +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode #NOTE(zhiqiu): run with FLAGS_cudnn_deterministic=1 @@ -242,7 +243,7 @@ def _reader_imple(): return _reader_imple - def test_resnet_float32(self): + def func_test_resnet_float32(self): seed = 90 batch_size = train_parameters["batch_size"] @@ -259,14 +260,9 @@ def test_resnet_float32(self): train_parameters, parameter_list=resnet.parameters()) np.random.seed(seed) - batch_py_reader = fluid.io.PyReader(capacity=1) - batch_py_reader.decorate_sample_list_generator( - paddle.batch( - self.reader_decorator( - paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True), - places=fluid.CPUPlace()) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) dy_param_init_value = {} for 
param in resnet.parameters(): @@ -275,16 +271,21 @@ def test_resnet_float32(self): helper = DyGraphProgramDescTracerTestHelper(self) program = None - for batch_id, data in enumerate(batch_py_reader()): + for batch_id, data in enumerate(train_reader()): if batch_id >= batch_num: break - img = data[0] - label = data[1] + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) label.stop_gradient = True out = None - if batch_id % 5 == 0: + if batch_id % 5 == 0 and not _in_eager_mode(): out, traced_layer = TracedLayer.trace(resnet, img) if program is not None: self.assertTrue( @@ -430,6 +431,11 @@ def test_resnet_float32(self): self.assertTrue(np.isfinite(value.all())) self.assertFalse(np.isnan(value.any())) + def test_resnet_float32(self): + with _test_eager_guard(): + self.func_test_resnet_float32() + self.func_test_resnet_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 13570d1bf71a5..daa778288ddf5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -22,6 +22,7 @@ from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from test_imperative_resnet import ResNet +from paddle.fluid.framework import _test_eager_guard batch_size = 8 train_parameters = { @@ -71,7 +72,7 @@ def optimizer_setting(params, parameter_list=None): class TestDygraphResnetSortGradient(unittest.TestCase): - def test_resnet_sort_gradient_float32(self): + def func_test_resnet_sort_gradient_float32(self): seed = 90 batch_size = train_parameters["batch_size"] @@ -230,6 +231,11 @@ def test_resnet_sort_gradient_float32(self): self.assertTrue(np.isfinite(value.all())) self.assertFalse(np.isnan(value.any())) + def test_resnet_sort_gradient_float32(self): + with _test_eager_guard(): + self.func_test_resnet_sort_gradient_float32() + self.func_test_resnet_sort_gradient_float32() + if __name__ == '__main__': unittest.main() From 9880595ae7c550d0cc2b6c6a33e0607e5382d9b8 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 23 Feb 2022 10:23:12 +0800 Subject: [PATCH 053/101] fix activation ut typo xpu. 
test=kunlun (#39813) --- .../paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 3436e443ab853..6a7e5f08b5e48 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -229,7 +229,7 @@ def gelu(x, approximate): return y_ref.astype(x.dtype) -class XPUTestHardSwishGeluOP(XPUOpTestWrapper): +class XPUTestHardSwishOP(XPUOpTestWrapper): def __init__(self): self.op_name = 'hard_swish' self.use_dynamic_create_class = False From 22abb6b3d023fd305a58b90738f8201aa549709e Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 23 Feb 2022 10:36:02 +0800 Subject: [PATCH 054/101] fix 'is with a literal' warning (#39798) * fix 'is with a literal' * fix typo --- python/paddle/fluid/dygraph/amp/auto_cast.py | 2 +- python/paddle/fluid/framework.py | 2 +- python/paddle/fluid/tests/unittests/op_test.py | 2 +- python/paddle/nn/functional/loss.py | 4 ++-- python/paddle/nn/functional/norm.py | 2 +- python/paddle/nn/layer/conv.py | 2 +- python/paddle/nn/layer/norm.py | 4 ++-- python/paddle/nn/layer/rnn.py | 2 +- python/paddle/nn/layer/vision.py | 2 +- python/paddle/tensor/linalg.py | 2 +- python/paddle/tensor/math.py | 4 ++-- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 37134764e9d1c..41a7d3d774793 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -161,7 +161,7 @@ def pure_fp16_initialize(models): for idx in range(len(models)): for layer in models[idx].sublayers(include_self=True): layer._casted_by_pure_fp16 = True - if (layer._dtype is 'float16') or isinstance( + if (layer._dtype == 'float16') or isinstance( layer, (paddle.nn.BatchNorm, paddle.nn.BatchNorm1D, paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D, paddle.nn.LayerNorm)): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 780b8acc4fde6..d0a94238a7aeb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2544,7 +2544,7 @@ def __init__(self, warnings.warn("The Op(%s) is not support to set device." 
% type) if 'force_cpu' in op_attrs: - if (type is 'less_than' and op_attrs['force_cpu'] != None + if (type == 'less_than' and op_attrs['force_cpu'] != None ) or op_attrs['force_cpu'] != False: warnings.warn( "The Attr(force_cpu) of Op(%s) will be deprecated in the future, " diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ec3b68086b065..92cba4fca5aba 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -380,7 +380,7 @@ def is_bfloat16_op(self): hasattr(self, 'output_dtype') and self.output_dtype == np.uint16) or ( hasattr(self, 'mkldnn_data_type') and - getattr(self, 'mkldnn_data_type') is "bfloat16") or ( + getattr(self, 'mkldnn_data_type') == "bfloat16") or ( hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs and self.attrs['mkldnn_data_type'] == 'bfloat16') diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 636d2f645c5b0..94c516f476ede 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -131,7 +131,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', fluid.data_feeder.check_variable_and_dtype( label, 'label', ['float32', 'float64'], 'binary_cross_entropy') - sub_name = name if weight is None and reduction is 'none' else None + sub_name = name if weight is None and reduction == 'none' else None helper = LayerHelper("binary_cross_entropy", name=sub_name) out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( @@ -144,7 +144,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', if weight is not None: if isinstance(weight, paddle.static.Variable): - weight_name = name if reduction is 'none' else None + weight_name = name if reduction == 'none' else None out = paddle.multiply(out, weight, name=weight_name) else: raise ValueError( diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index c59d0eb5e6d11..a5de268ec2314 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -217,7 +217,7 @@ def batch_norm(x, helper = LayerHelper('batch_norm', **locals()) - param_dtype = x.dtype if x.dtype is not 'float16' else 'float32' + param_dtype = x.dtype if x.dtype != 'float16' else 'float32' saved_mean = helper.create_variable_for_type_inference( dtype=param_dtype, stop_gradient=True) saved_variance = helper.create_variable_for_type_inference( diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index fd7355e162ae7..bb1cbbfc03e55 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -162,7 +162,7 @@ def extra_repr(self): main_str += ', stride={_stride}' if self._padding != 0: main_str += ', padding={_padding}' - if self._padding_mode is not 'zeros': + if self._padding_mode != 'zeros': main_str += ', padding_mode={_padding_mode}' if self.output_padding != 0: main_str += ', output_padding={output_padding}' diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 8113073d757d6..7c3e3ad8dee9f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -668,7 +668,7 @@ def forward(self, input): def extra_repr(self): main_str = 'num_features={}, momentum={}, epsilon={}'.format( self._num_features, self._momentum, self._epsilon) - if self._data_format is not 'NCHW': + if self._data_format != 'NCHW': main_str += ', 
data_format={}'.format(self._data_format) if self._name is not None: main_str += ', name={}'.format(self._name) @@ -1252,7 +1252,7 @@ def forward(self, input): def extra_repr(self): main_str = 'size={}, alpha={}, beta={}, k={}'.format( self.size, self.alpha, self.beta, self.k) - if self.data_format is not 'NCHW': + if self.data_format != 'NCHW': main_str += ', data_format={}'.format(self.data_format) if self.name is not None: main_str += ', name={}'.format(self.name) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 2bb1f1311107b..09a0d3cb41cbc 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -391,7 +391,7 @@ def state_shape(self): def extra_repr(self): s = '{input_size}, {hidden_size}' - if self.activation is not "tanh": + if self.activation != "tanh": s += ', activation={activation}' return s.format(**self.__dict__) diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index 7f8b51ca10818..0531afb4eeeeb 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -82,7 +82,7 @@ def forward(self, x): def extra_repr(self): main_str = 'upscale_factor={}'.format(self._upscale_factor) - if self._data_format is not 'NCHW': + if self._data_format != 'NCHW': main_str += ', data_format={}'.format(self._data_format) if self._name is not None: main_str += ', name={}'.format(self._name) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 91d688b761a11..fef1652040835 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2792,7 +2792,7 @@ def __check_input(x, UPLO): raise ValueError( "The input matrix must be batches of square matrices. But received x's dimention: {}". format(x_shape)) - if UPLO is not 'L' and UPLO is not 'U': + if UPLO != 'L' and UPLO != 'U': raise ValueError( "UPLO must be L or U. But received UPLO is: {}".format(UPLO)) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index a36bf1c432515..ce29e9dce8180 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3439,7 +3439,7 @@ def erfinv_(x, name=None): return _C_ops.erfinv_(x) def rad2deg(x, name=None): - """ + r""" Convert each of the elements of input x from angles in radians to degrees. Equation: @@ -3498,7 +3498,7 @@ def rad2deg(x, name=None): return out def deg2rad(x, name=None): - """ + r""" Convert each of the elements of input x from degrees to angles in radians. 
Equation: From da492a13fb04aab7aef858e6d3e5019cfd035405 Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 23 Feb 2022 10:41:59 +0800 Subject: [PATCH 055/101] refactoring gather/masked_select/arg_max unittests for kunlun, *test=kunlun (#39711) --- .../unittests/xpu/test_arg_max_op_xpu.py | 179 +++++------- .../tests/unittests/xpu/test_gather_op_xpu.py | 272 ++++++------------ .../xpu/test_masked_select_op_xpu.py | 80 ++---- 3 files changed, 190 insertions(+), 341 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py index cbdd9db8ee7f2..519a185250ab0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py @@ -18,118 +18,93 @@ import numpy as np import sys sys.path.append("..") + +import paddle from op_test import OpTest from op_test_xpu import XPUOpTest -import paddle -import paddle.fluid.core as core +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class XPUBaseTestCase(XPUOpTest): - def initTestCase(self): - self.dims = (3, 4) - self.dtype = 'float32' - self.axis = 1 +class XPUTestArgMax(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'arg_max' - def setUp(self): - self.initTestCase() - self.__class__.op_type = 'arg_max' - self.__class__.use_xpu = True - np.random.seed(2021) - self.x = (np.random.random(self.dims)).astype(self.dtype) - self.inputs = {'X': self.x} - self.attrs = {'axis': self.axis, 'use_xpu': True} - if self.op_type == "arg_min": - self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} - else: - self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - -# test argmax, dtype: float32 -class TestArgMaxFloat32Case1(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4, 5) - self.dtype = 'float32' - self.axis = -1 + class XPUBaseTestCase(XPUOpTest): + def initTestCase(self): + self.dims = (3, 4) + self.axis = 1 + def setUp(self): + self.op_type = 'arg_max' + self.dtype = self.in_type + self.initTestCase() -class TestArgMaxFloat32Case2(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4, 5) - self.dtype = 'float32' - self.axis = 0 - - -class TestArgMaxFloat32Case3(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4, 5) - self.dtype = 'float32' - self.axis = 1 - - -class TestArgMaxFloat32Case4(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4, 5) - self.dtype = 'float32' - self.axis = 2 - - -class TestArgMaxFloat32Case5(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4) - self.dtype = 'float32' - self.axis = -1 - - -class TestArgMaxFloat32Case6(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4) - self.dtype = 'float32' - self.axis = 0 - - -class TestArgMaxFloat32Case7(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, 4) - self.dtype = 'float32' - self.axis = 1 - - -class TestArgMaxFloat32Case8(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (1, ) - self.dtype = 'float32' - self.axis = 0 - - -class TestArgMaxFloat32Case9(XPUBaseTestCase): - def initTestCase(self): 
- self.op_type = 'arg_max' - self.dims = (2, ) - self.dtype = 'float32' - self.axis = 0 - + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis, 'use_xpu': True} + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} -class TestArgMaxFloat32Case10(XPUBaseTestCase): - def initTestCase(self): - self.op_type = 'arg_max' - self.dims = (3, ) - self.dtype = 'float32' - self.axis = 0 + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + class TestArgMaxCase1(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.axis = -1 + + class TestArgMaxCase2(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.axis = 0 + + class TestArgMaxCase3(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.axis = 1 + + class TestArgMaxCase4(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.axis = 2 + + class TestArgMaxCase5(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4) + self.axis = -1 + + class TestArgMaxCase6(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4) + self.axis = 0 + + class TestArgMaxCase7(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, 4) + self.axis = 1 + + class TestArgMaxCase8(XPUBaseTestCase): + def initTestCase(self): + self.dims = (1, ) + self.axis = 0 + + class TestArgMaxCase9(XPUBaseTestCase): + def initTestCase(self): + self.dims = (2, ) + self.axis = 0 + + class TestArgMaxCase10(XPUBaseTestCase): + def initTestCase(self): + self.dims = (3, ) + self.axis = 0 + + +support_types = get_xpu_op_support_types('arg_max') +for stype in support_types: + create_test_class(globals(), XPUTestArgMax, stype) class TestArgMaxAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py index bdf74018abb58..f0e6315514fb5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py @@ -20,9 +20,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -34,194 +33,87 @@ def gather_numpy(x, index, axis): return gather -class TestXPUGatherOp(XPUOpTest): - def setUp(self): - self.op_type = "gather" - self.use_xpu = True - self.use_mkldnn = False - self.attrs = {'use_xpu': True} - - self.config() - xnp = np.random.random(self.x_shape).astype(self.x_type) - self.inputs = { - 'X': xnp, - 'Index': np.array(self.index).astype(self.index_type) - } - self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} - - def config(self): - """ - For multi-dimension input - """ - self.dtype = np.float32 - self.x_shape = (10, 20) - self.x_type = np.float32 - self.index = [1, 3, 5] - self.index_type = np.int32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestCase1(TestXPUGatherOp): - def config(self): - """ - For one dimension input - """ - self.dtype = np.float32 - self.x_shape = (100) - self.x_type = np.float32 - 
self.index = [1, 3, 5] - self.index_type = np.int32 - - -class TestCase2(TestXPUGatherOp): - def config(self): - """ - For int64_t index type - """ - self.dtype = np.float32 - self.x_shape = (100) - self.x_type = np.float32 - self.index = [1, 3, 5] - self.index_type = np.int64 - - -class TestCase3(TestXPUGatherOp): - def config(self): - """ - For other input type - """ - self.dtype = np.float32 - self.x_shape = (10, 20) - self.x_type = np.float32 - self.index = [1, 3, 5] - self.index_type = np.int32 - - -class TestCase4(TestXPUGatherOp): - def config(self): - self.dtype = np.float32 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': False} - self.x_type = np.float32 - self.index = [1, 1] - self.index_type = np.int32 - - -class TestCase5(TestXPUGatherOp): - def config(self): - self.dtype = np.float32 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': False} - self.x_type = np.float32 - self.index = [1, 1, 3] - self.index_type = np.int32 - - -class TestCase6(TestXPUGatherOp): - def config(self): - self.dtype = np.float32 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': True} - self.x_type = np.float32 - self.index = [1, 3] - self.index_type = np.int32 - - -class TestCase7(TestXPUGatherOp): - def config(self): - self.dtype = np.float32 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': True} - self.x_type = np.float32 - self.index = [1, 3] - self.index_type = np.int64 - - -## test fp16 -class TestCaseFP161(TestXPUGatherOp): - def config(self): - """ - For one dimension input - """ - self.dtype = np.float16 - self.x_shape = (100) - self.x_type = np.float16 - self.index = [1, 3, 5] - self.index_type = np.int32 - - -class TestCaseFP162(TestXPUGatherOp): - def config(self): - """ - For int64_t index type - """ - self.dtype = np.float16 - self.x_shape = (100) - self.x_type = np.float16 - self.index = [1, 3, 5] - self.index_type = np.int64 - - -class TestCaseFP163(TestXPUGatherOp): - def config(self): - """ - For other input type - """ - self.dtype = np.float16 - self.x_shape = (10, 20) - self.x_type = np.float16 - self.index = [1, 3, 5] - self.index_type = np.int32 - - -class TestCaseFP164(TestXPUGatherOp): - def config(self): - self.dtype = np.float16 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': False} - self.x_type = np.float16 - self.index = [1, 1] - self.index_type = np.int32 - - -class TestCaseFP165(TestXPUGatherOp): - def config(self): - self.dtype = np.float16 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': False} - self.x_type = np.float16 - self.index = [1, 1, 3] - self.index_type = np.int32 - - -class TestCaseFP166(TestXPUGatherOp): - def config(self): - self.dtype = np.float16 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': True} - self.x_type = np.float16 - self.index = [1, 3] - self.index_type = np.int32 - - -class TestCaseFP167(TestXPUGatherOp): - def config(self): - self.dtype = np.float16 - self.x_shape = (10, 20) - self.attrs = {'use_xpu': True, 'overwrite': True} - self.x_type = np.float16 - self.index = [1, 3] - self.index_type = np.int64 - +class XPUTestGather(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'gather' + + class TestXPUGatherOp(XPUOpTest): + def setUp(self): + self.op_type = "gather" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + + self.init_config() + xnp = np.random.random(self.x_shape).astype(self.dtype) + self.inputs = { + 'X': xnp, + 'Index': 
np.array(self.index).astype(self.index_type) + } + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def init_config(self): + self.x_shape = (10, 20) + self.index = [1, 3, 5] + self.index_type = np.int32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestCase1(TestXPUGatherOp): + def init_config(self): + self.x_shape = (100) + self.index = [1, 3, 5] + self.index_type = np.int32 + + class TestCase2(TestXPUGatherOp): + def init_config(self): + self.x_shape = (100) + self.index = [1, 3, 5] + self.index_type = np.int64 + + class TestCase3(TestXPUGatherOp): + def init_config(self): + self.x_shape = (10, 20) + self.index = [1, 3, 5] + self.index_type = np.int32 + + class TestCase4(TestXPUGatherOp): + def init_config(self): + self.x_shape = (10, 20) + self.attrs = {'overwrite': False} + self.index = [1, 1] + self.index_type = np.int32 + + class TestCase5(TestXPUGatherOp): + def init_config(self): + self.x_shape = (10, 20) + self.attrs = {'overwrite': False} + self.index = [1, 1, 3] + self.index_type = np.int32 + + class TestCase6(TestXPUGatherOp): + def init_config(self): + self.x_shape = (10, 20) + self.attrs = {'overwrite': True} + self.index = [1, 3] + self.index_type = np.int32 + + class TestCase7(TestXPUGatherOp): + def init_config(self): + self.x_shape = (10, 20) + self.attrs = {'overwrite': True} + self.index = [1, 3] + self.index_type = np.int64 + + +support_types = get_xpu_op_support_types('gather') +for stype in support_types: + create_test_class(globals(), XPUTestGather, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py index 8c5b3f3d8a9af..990594e1f9edf 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py @@ -18,10 +18,11 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest + import paddle import paddle.fluid as fluid +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -34,61 +35,42 @@ def np_masked_select(x, mask): return result.flatten() -class TestMaskedSelectOp(XPUOpTest): - def set_xpu(self): - self.__class__.use_xpu = True - - def setUp(self): - self.set_xpu() - self.init() - self.init_dtype() - self.place = paddle.XPUPlace(0) - self.op_type = "masked_select" - x = np.random.random(self.shape).astype(self.dtype) - mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) - out = np_masked_select(x, mask) - self.inputs = {'X': x, 'Mask': mask} - self.outputs = {'Y': out} - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - def init(self): - self.shape = (50, 3) - - def init_dtype(self): - self.dtype = np.float32 - - -class TestMaskedSelectOp1(TestMaskedSelectOp): - def init(self): - self.shape = (6, 8, 9, 18) +class XPUTestMaskedSelectOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'masked_select' + class TestMaskedSelectOp(XPUOpTest): + def setUp(self): + self.init() + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "masked_select" + 
self.__class__.no_need_check_grad = True -class TestMaskedSelectOp2(TestMaskedSelectOp): - def init(self): - self.shape = (168, ) + x = np.random.random(self.shape).astype(self.dtype) + mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) + out = np_masked_select(x, mask) + self.inputs = {'X': x, 'Mask': mask} + self.outputs = {'Y': out} + def test_check_output(self): + self.check_output_with_place(self.place) -class TestMaskedSelectOpInt32(TestMaskedSelectOp): - def init_dtype(self): - self.dtype = np.int32 + def init(self): + self.shape = (50, 3) - # skip_check_grad_ci(reason="get_numeric_gradient not support int32") - def test_check_grad(self): - pass + class TestMaskedSelectOp1(TestMaskedSelectOp): + def init(self): + self.shape = (6, 8, 9, 18) + class TestMaskedSelectOp2(TestMaskedSelectOp): + def init(self): + self.shape = (168, ) -class TestMaskedSelectOpInt64(TestMaskedSelectOp): - def init_dtype(self): - self.dtype = np.int64 - # skip_check_grad_ci(reason="get_numeric_gradient not support int64") - def test_check_grad(self): - pass +support_types = get_xpu_op_support_types('masked_select') +for stype in support_types: + create_test_class(globals(), XPUTestMaskedSelectOp, stype) class TestMaskedSelectAPI(unittest.TestCase): From 4130b640314d003ab452f9b0f2aa7dac132668f6 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Wed, 23 Feb 2022 11:09:03 +0800 Subject: [PATCH 056/101] update gather_nd trt converter ut (#39584) * update gather_nd trt converter ut * update ut --- .../inference/test_trt_convert_gather_nd.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py index 0c7eae5f85f95..6b6a9536d81be 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py @@ -346,7 +346,7 @@ def generate_input1(): return np.random.random([2, 32]).astype(np.float32) def generate_input2(): - return np.ones([2, 2]).astype(np.int32) + return np.array([[0, 3], [1, 9]]).astype(np.int32) ops_config = [{ "op_type": "gather_nd", @@ -408,23 +408,11 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 - - def add_skip_trt_case(self): - def teller(program_config, predictor_config): - if len(self.dynamic_shape.min_input_shape) != 0: - return True - return False - - self.add_skip_case( - teller, SkipReasons.TRT_NOT_SUPPORT, - "Need to repair the case: the output of trt and GPU has diff when inputs' dim is 1 and 2." 
- ) + yield self.create_inference_config(), (0, 4), 1e-5 def test(self): - self.add_skip_trt_case() self.run_test() @@ -434,10 +422,11 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(): - return np.random.random([2, 32, 256]).astype(np.float32) + return np.random.random([16, 32, 256]).astype(np.float32) def generate_input2(): - return np.ones([2, 2, 2]).astype(np.int32) + return np.array( + [[[2, 5], [3, 8]], [[0, 2], [0, 3]]]).astype(np.int32) ops_config = [{ "op_type": "gather_nd", @@ -471,7 +460,7 @@ def generate_dynamic_shape(attrs): "index_data": [1, 1, 1] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64, 512], + "input_data": [16, 64, 512], "index_data": [4, 2, 4] } self.dynamic_shape.opt_input_shape = { From 24f55aedaa991e8ab00d301e568b17c0fae7200a Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 23 Feb 2022 11:12:15 +0800 Subject: [PATCH 057/101] [IPU] update inference demos (#39792) * update inference part * restore white space --- paddle/fluid/inference/CMakeLists.txt | 5 +- paddle/fluid/inference/analysis/argument.h | 8 +- .../analysis/passes/ir_graph_build_pass.cc | 18 +- paddle/fluid/inference/api/analysis_config.cc | 37 +++- .../fluid/inference/api/analysis_predictor.cc | 18 +- .../inference/api/paddle_analysis_config.h | 47 +++-- paddle/fluid/inference/api/paddle_tensor.h | 2 +- .../fluid/inference/tests/api/CMakeLists.txt | 25 ++- .../tests/api/analyzer_ernie_tester.h | 3 +- .../tests/api/ipu_ernie_fp16_test.cc | 184 ++++++++++++++++ .../inference/tests/api/ipu_ernie_test.cc | 196 ++++++++++++++++++ .../tests/api/ipu_multi_model_profile.cc | 105 ++++++++++ .../tests/api/ipu_resnet50_fp16_test.cc | 86 ++++++++ .../inference/tests/api/ipu_resnet50_test.cc | 10 +- .../tests/api/ipu_word2vec_sample.cc | 81 ++++++++ .../fluid/inference/tests/api/tester_helper.h | 52 +++++ 16 files changed, 823 insertions(+), 54 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc create mode 100644 paddle/fluid/inference/tests/api/ipu_ernie_test.cc create mode 100644 paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc create mode 100644 paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc create mode 100644 paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index d731bfe139bac..887bd52bae547 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -48,11 +48,10 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) +elseif(WITH_IPU) + cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) else() create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) - if(WITH_IPU) - target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils) - endif() endif() if(NOT APPLE) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index f474ccd260e80..a5c32164bf1a2 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -278,10 +278,14 @@ struct Argument { // ipu related DECL_ARGUMENT_FIELD(use_ipu, UseIpu, bool); DECL_ARGUMENT_FIELD(ipu_device_num, IpuDeviceNum, int); + DECL_ARGUMENT_FIELD(ipu_micro_batch_size, IpuMicroBatchSize, int); DECL_ARGUMENT_FIELD(ipu_enable_pipelining, IpuEnablePipelining, bool); DECL_ARGUMENT_FIELD(ipu_batches_per_step, IpuBatchesPerStep, int); - DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int); - DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool); + DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool); + DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int); + DECL_ARGUMENT_FIELD(ipu_available_memory_proportion, + IpuAvailableMemoryProportion, float); + DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); // npu related DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index fe6a27f80725f..321716b1c8a18 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -72,17 +72,21 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (argument->use_ipu()) { argument->main_graph().SetNotOwned("num_ipus", &argument->ipu_device_num()); - argument->main_graph().SetNotOwned("need_avg_shard", - &argument->ipu_need_avg_shard()); + argument->main_graph().SetNotOwned("micro_batch_size", + &argument->ipu_micro_batch_size()); argument->main_graph().SetNotOwned("enable_pipelining", &argument->ipu_enable_pipelining()); argument->main_graph().SetNotOwned("batches_per_step", &argument->ipu_batches_per_step()); - argument->main_graph().SetNotOwned("batch_size", - &argument->ipu_batch_size()); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Please compile with WITH_IPU")); + argument->main_graph().SetNotOwned("enable_fp16", + &argument->ipu_enable_fp16()); + argument->main_graph().SetNotOwned("replica_num", + &argument->ipu_replica_num()); + argument->main_graph().SetNotOwned( + "available_memory_proportion", + &argument->ipu_available_memory_proportion()); + argument->main_graph().SetNotOwned("enable_half_partial", + &argument->ipu_enable_half_partial()); } } #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 57e49733b329a..fd2ccffae3b4a 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -142,17 +142,28 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } -void AnalysisConfig::EnableIpu(int device_num, bool ipu_enable_pipelining, - int ipu_batches_per_step, int ipu_batch_size, - bool ipu_need_avg_shard) { + 
+void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, + bool ipu_enable_pipelining, + int ipu_batches_per_step) { enable_ir_optim_ = true; use_ipu_ = true; - ipu_device_num_ = device_num; + ipu_device_num_ = ipu_device_num; + ipu_micro_batch_size_ = ipu_micro_batch_size; ipu_enable_pipelining_ = ipu_enable_pipelining; ipu_batches_per_step_ = ipu_batches_per_step; - ipu_batch_size_ = ipu_batch_size; - ipu_need_avg_shard_ = ipu_need_avg_shard; + + Update(); +} + +void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, + float ipu_available_memory_proportion, + bool ipu_enable_half_partial) { + ipu_enable_fp16_ = ipu_enable_fp16; + ipu_replica_num_ = ipu_replica_num; + ipu_available_memory_proportion_ = ipu_available_memory_proportion; + ipu_enable_half_partial_ = ipu_enable_half_partial; Update(); } @@ -255,10 +266,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // ipu related CP_MEMBER(use_ipu_); CP_MEMBER(ipu_device_num_); + CP_MEMBER(ipu_micro_batch_size_); CP_MEMBER(ipu_enable_pipelining_); CP_MEMBER(ipu_batches_per_step_); - CP_MEMBER(ipu_batch_size_); - CP_MEMBER(ipu_need_avg_shard_); + CP_MEMBER(ipu_enable_fp16_); + CP_MEMBER(ipu_replica_num_); + CP_MEMBER(ipu_available_memory_proportion_); + CP_MEMBER(ipu_enable_half_partial_); if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, @@ -684,10 +698,13 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_ipu_; ss << ipu_device_num_; + ss << ipu_micro_batch_size_; ss << ipu_enable_pipelining_; ss << ipu_batches_per_step_; - ss << ipu_batch_size_; - ss << ipu_need_avg_shard_; + ss << ipu_enable_fp16_; + ss << ipu_replica_num_; + ss << ipu_available_memory_proportion_; + ss << ipu_enable_half_partial_; return ss.str(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6c005e4b2d6e4..cd6e3a3c759c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -93,6 +93,8 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, input_ptr = t->mutable_data(ddim, place); } else if (pt.dtype == PaddleDType::INT32) { input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::FLOAT16) { + input_ptr = t->mutable_data(ddim, place); } else { LOG(ERROR) << "unsupported feed type " << pt.dtype; return false; @@ -563,8 +565,12 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } else if (type == framework::proto::VarType::INT32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT32; + } else if (type == framework::proto::VarType::FP16) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT16; } else { - LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + LOG(ERROR) << "unknown type, only support float32, float16, int64 and " + "int32 now."; } } return true; @@ -662,12 +668,18 @@ void AnalysisPredictor::PrepareArgument() { LOG(INFO) << "Lite subgraph engine is enabled"; } +#ifdef PADDLE_WITH_IPU argument_.SetUseIpu(config_.use_ipu_); argument_.SetIpuDeviceNum(config_.ipu_device_num()); + argument_.SetIpuMicroBatchSize(config_.ipu_micro_batch_size_); argument_.SetIpuEnablePipelining(config_.ipu_enable_pipelining_); argument_.SetIpuBatchesPerStep(config_.ipu_batches_per_step_); - argument_.SetIpuBatchSize(config_.ipu_batch_size_); - argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_); + argument_.SetIpuEnableFp16(config_.ipu_enable_fp16_); + 
argument_.SetIpuReplicaNum(config_.ipu_replica_num_); + argument_.SetIpuAvailableMemoryProportion( + config_.ipu_available_memory_proportion_); + argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_); +#endif argument_.SetUseNpu(config_.use_npu_); argument_.SetNPUDeviceId(config_.npu_device_id()); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 4b13ca073bc4f..180c028c6a610 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -234,20 +234,30 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \brief Turn on IPU. /// - /// \param device_num The number of IPUs. - /// \param ipu_enable_pipelining Enable data pipelining between subgraphs, - /// each subgraph is settled on an IPU. (This feature requires the number of - /// IPUs > 1.) - /// \param ipu_batches_per_step The number of micro_batch_size per run. (This - /// feature requires to enable pipelining.) - /// \param ipu_batch_size The micro_batch_size which is the batch_size in the - /// graph. - /// \param ipu_need_avg_shard Enable the auto graph sharding. (This feature - /// requires the number of IPUs > 1.) - /// - void EnableIpu(int device_num = 1, bool ipu_enable_pipelining = false, - int ipu_batches_per_step = 1, int ipu_batch_size = 1, - bool ipu_need_avg_shard = false); + /// \param ipu_device_num the number of IPUs. + /// \param ipu_micro_batch_size the batch size in the graph, only work with + /// mutable input shapes. + /// \param ipu_enable_pipelining enable pipelining. + /// \param ipu_batches_per_step the number of batches per run in pipelining. + /// + void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1, + bool ipu_enable_pipelining = false, + int ipu_batches_per_step = 1); + + /// + /// \brief Set IPU config. + /// + /// \param ipu_enable_fp16 enable fp16. + /// \param ipu_replica_num the number of graph replication. + /// \param ipu_available_memory_proportion the available memory proportion for + /// matmul/conv. + /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work + /// with fp16. + /// + void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, + float ipu_available_memory_proportion = 1.0, + bool ipu_enable_half_partial = false); + /// /// \brief Set XPU device id. /// @@ -876,11 +886,14 @@ struct PD_INFER_DECL AnalysisConfig { // ipu related. bool use_ipu_{false}; int ipu_device_num_{1}; - + int ipu_micro_batch_size_{1}; bool ipu_enable_pipelining_{false}; int ipu_batches_per_step_{1}; - int ipu_batch_size_{1}; - bool ipu_need_avg_shard_{false}; + + bool ipu_enable_fp16_{false}; + int ipu_replica_num_{1}; + float ipu_available_memory_proportion_{1.0}; + bool ipu_enable_half_partial_{false}; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 24a72a0b9dadb..81eecbb2c1480 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -45,7 +45,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. 
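A minimal usage sketch of the reworked IPU options declared above (EnableIpu plus SetIpuConfig), mirroring the calls made by the new IPU tests later in this patch. The model path, the included header, and the predictor-creation call are illustrative assumptions for this sketch, not part of the patch itself.

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./ernie_model");  // placeholder model directory

  // One IPU, micro batch size 1, pipelining off, one batch per step.
  config.EnableIpu(/*ipu_device_num=*/1, /*ipu_micro_batch_size=*/1,
                   /*ipu_enable_pipelining=*/false,
                   /*ipu_batches_per_step=*/1);

  // fp16 enabled, a single replica, full available memory proportion for
  // matmul/conv, and fp16 partials for matmul.
  config.SetIpuConfig(/*ipu_enable_fp16=*/true, /*ipu_replica_num=*/1,
                      /*ipu_available_memory_proportion=*/1.0f,
                      /*ipu_enable_half_partial=*/true);

  auto predictor = paddle::CreatePaddlePredictor(config);
  return 0;
}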
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9dafd0d17c715..85fe931cf93f8 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -758,11 +758,30 @@ if(ON_INFER OR WITH_GPU) set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) endif() -# IPU if (WITH_IPU) - #resnet50 + #word2vec sample + set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model") + inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) + + # ERNIE + set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") + inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc + ARGS --warmup=true --repeat=10) + inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc + ARGS --warmup=true --repeat=10) + + # Resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=1000) + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + + # Only support Resnet50 and Ernie currently + inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc + ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h index 2582a1cb09eef..fffcd38d95a0c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -150,8 +150,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, void SetIpuConfig(AnalysisConfig *cfg, int batch_size = 1) { cfg->SetModel(FLAGS_infer_model); - // num_ipu, enable_pipelining, batches_per_step, batch_size, need_avg_shard - cfg->EnableIpu(4, false, 1, batch_size, true); + cfg->EnableIpu(4, batch_size, false, 1); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc new file mode 100644 index 0000000000000..fa775bd9a9cb9 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + // fp32 to fp16 + ConvertFP32toFP16(input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + cfg->SetIpuConfig(true, 1, 1.0, true); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *fp32_data = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], fp32_data[j], 5e-3); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc new file mode 100644 index 0000000000000..e36917c9acd3e --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
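The 5e-3 tolerance above is looser than in the fp32 tests, presumably to absorb the float32 -> float16 -> float32 round trip that both the inputs and outputs make. A minimal sketch of that round trip, assuming the ConvertFP32toFP16/ConvertFP16toFP32 helpers this patch adds to tester_helper.h are in scope (values are illustrative):

  void Fp16RoundTripSketch() {
    paddle::PaddleTensor t;
    t.shape = {1, 4};
    std::vector<float> v = {0.1f, 0.2f, 0.3f, 0.4f};
    t.data = paddle::PaddleBuf(v.data(), v.size() * sizeof(float));
    t.dtype = paddle::PaddleDType::FLOAT32;
    paddle::inference::ConvertFP32toFP16(t);  // buffer reallocated as float16, dtype flips
    // ... feed t to the predictor and run ...
    paddle::inference::ConvertFP16toFP32(t);  // back to float32; values match only to half precision
  }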
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); +} + +void profile() { + AnalysisConfig config; + SetConfig(&config); + + std::vector> outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +// Compare Deterministic result +TEST(Analyzer_Ernie_ipu, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + auto outputs_size = outputs.front().data.length() / (sizeof(float)); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], + static_cast(outputs[0].data.data())[j], + FLAGS_accuracy); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc new file mode 100644 index 0000000000000..a225feae4a261 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +void ErnieInputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 128 * 1; + std::vector placeholder_012(input_num, 1); + std::vector placeholder_3(input_num, 1); + + for (int i = 0; i < 4; i++) { + PaddleTensor in; + in.name = "placeholder_" + std::to_string(i); + in.shape = {total_batch_size, 128, 1}; + if (i < 3) { + in.data = PaddleBuf(static_cast(placeholder_012.data()), + input_num * sizeof(int64_t)); + in.dtype = PaddleDType::INT64; + } else { + in.data = PaddleBuf(static_cast(placeholder_3.data()), + input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + } + inputs->push_back(std::move(in)); + } +} + +void Resnet50InputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 3 * 318 * 318; + std::vector input(input_num, 1); + PaddleTensor in; + in.shape = {total_batch_size, 3, 318, 318}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + inputs->push_back(std::move(in)); +} + +// performance profile +TEST(Analyzer_ipu_fp16, performance_profile) { + AnalysisConfig config; + std::vector inputs; + std::vector> outputs; + + int total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_replica_num; + if (FLAGS_ipu_enable_pipelining) { + // if device_num > 1 and pipelining is enabled, the total batch size = + // micro_batch_size * device_num(batches_per_step) * replica_num + total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_batches_per_step * + FLAGS_ipu_replica_num; + } + + if (FLAGS_model_name == "Resnet50") { + config.SetModel(FLAGS_infer_model + "/model/model", + FLAGS_infer_model + "/model/params"); + Resnet50InputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else if (FLAGS_model_name == "Ernie") { + config.SetModel(FLAGS_infer_model + "/model/"); + ErnieInputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support Resnet50 and Ernie Currently")); + } + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining, + // ipu_batches_per_step + config.EnableIpu(FLAGS_ipu_device_num, FLAGS_ipu_micro_batch_size, + FLAGS_ipu_enable_pipelining, FLAGS_ipu_batches_per_step); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(FLAGS_ipu_enable_fp16, FLAGS_ipu_replica_num, + FLAGS_ipu_available_memory_proportion, + FLAGS_ipu_enable_half_partial); + + TestPrediction(reinterpret_cast(&config), + {inputs}, &outputs, 1); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc new file mode 100644 index 0000000000000..1d69069da0716 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +// Compare results with 1 batch +TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + AnalysisConfig config; + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(true, 1, 1.0, true); + config.SetModel(model_dir + "/model", model_dir + "/params"); + + std::vector inputs; + auto predictor = CreatePaddlePredictor(config); + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + + PaddleTensor in; + in.shape = {batch, channel, height, width}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + ConvertFP32toFP16(in); + inputs.emplace_back(in); + + std::vector outputs; + + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + const std::vector truth_values = { + 127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, + 736.222f, -633.684f, -329.927f, -430.155f, -633.062f, -146.548f, + -1324.28f, -1349.36f, -242.675f, 117.448f, -801.723f, -391.514f, + -404.818f, 454.16f, 515.48f, -133.031f, 69.293f, 590.096f, + -1434.69f, -1070.89f, 307.074f, 400.525f, -316.12f, -587.125f, + -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, -447.938f, + 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f, + 551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, + 246.019f, -8.42969f, 131.365f, -648.051f}; + + const size_t expected_size = 1; + EXPECT_EQ(outputs.size(), expected_size); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float* fp32_data = reinterpret_cast(output.data.data()); + + for (size_t j = 0; j < outputs_size; j += 10) { + EXPECT_NEAR((fp32_data[j] - truth_values[j / 10]) / truth_values[j / 10], + 0., 9e-2); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index f5e755ab46691..5fde8e6a5e1e6 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -33,9 +33,8 @@ static std::vector truth_values = { TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(1, false); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; @@ -72,9 +71,8 @@ TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { 
TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(2, false, 1, 2, 1); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 2, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc new file mode 100644 index 0000000000000..d38c5c3416351 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a simple demo for how to take a model for inference with + * IPUs. + * Model: wget -q + * http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz + */ + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(infer_model, "", "Directory of the inference model."); + +using paddle_infer::Config; +using paddle_infer::Predictor; +using paddle_infer::CreatePredictor; + +void inference(std::string model_path, bool use_ipu, + std::vector *out_data) { + //# 1. Create Predictor with a config. + Config config; + config.SetModel(FLAGS_infer_model); + if (use_ipu) { + // ipu_device_num, ipu_micro_batch_size + config.EnableIpu(1, 4); + } + auto predictor = CreatePredictor(config); + + //# 2. Prepare input/output tensor. + auto input_names = predictor->GetInputNames(); + std::vector data{1, 2, 3, 4}; + // For simplicity, we set all the slots with the same data. + for (auto input_name : input_names) { + auto input_tensor = predictor->GetInputHandle(input_name); + input_tensor->Reshape({4, 1}); + input_tensor->CopyFromCpu(data.data()); + } + + //# 3. Run + predictor->Run(); + + //# 4. Get output. 
+ auto output_names = predictor->GetOutputNames(); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_tensor->CopyToCpu(out_data->data()); +} + +int main(int argc, char *argv[]) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + std::vector ipu_result; + std::vector cpu_result; + inference(FLAGS_infer_model, true, &ipu_result); + inference(FLAGS_infer_model, false, &cpu_result); + for (size_t i = 0; i < ipu_result.size(); i++) { + CHECK_NEAR(ipu_result[i], cpu_result[i], 1e-6); + } + LOG(INFO) << "Finished"; +} diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 77fab0a86f833..637fa16e31ba7 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -76,10 +76,23 @@ DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); DEFINE_bool(fuse_multi_gru, false, "Running the inference program with multi_gru_fuse_pass"); +// ipu related +DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size"); +DEFINE_int32(ipu_device_num, 1, "device num"); +DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining"); +DEFINE_int32(ipu_batches_per_step, 1, + "the number of batches per run in pipelining"); +DEFINE_bool(ipu_enable_fp16, false, "enable fp16"); +DEFINE_int32(ipu_replica_num, 1, "replica num"); +DEFINE_double(ipu_available_memory_proportion, 1.0, + "available memory proportion"); +DEFINE_bool(ipu_enable_half_partial, false, "enable half partial"); + namespace paddle { namespace inference { using paddle::framework::proto::VarType; +using float16 = paddle::platform::float16; template constexpr paddle::PaddleDType GetPaddleDType(); @@ -1060,5 +1073,44 @@ static bool CompareTensor(const framework::LoDTensor &a, return true; } +void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT32, + platform::errors::InvalidArgument( + "The tensor dtype is not float32, only support float32 as input")); + float *fp32_data = reinterpret_cast(tensor.data.data()); + float16 *fp16_data = new float16[num]; + for (int i = 0; i < num; i++) { + fp16_data[i] = float16(fp32_data[i]); + } + tensor.data = + PaddleBuf(static_cast(fp16_data), num * sizeof(float16)); + tensor.dtype = PaddleDType::FLOAT16; +} + +void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT16, + platform::errors::InvalidArgument( + "The tensor dtype is not float16, only support float16 as input")); + float16 *fp16_data = reinterpret_cast(tensor.data.data()); + float *fp32_data = new float[num]; + for (int i = 0; i < num; i++) { + fp32_data[i] = static_cast(fp16_data[i]); + } + tensor.data = PaddleBuf(static_cast(fp32_data), num * sizeof(float)); + tensor.dtype = PaddleDType::FLOAT32; +} + } // namespace inference } // namespace paddle From b7bcd0f643b90e87da749251011e364e3681e5d7 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 23 Feb 2022 11:14:23 +0800 Subject: [PATCH 058/101] [Phi] Migrate lable_smooth_op into Phi (#39796) * [Phi] Migrate lable_smooth_op into Phi * fix PT->PD --- 
paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/operators/label_smooth_op.cc | 11 +- paddle/fluid/operators/label_smooth_op.cu | 125 ------------------ paddle/fluid/operators/label_smooth_op.h | 70 ---------- paddle/fluid/operators/label_smooth_op_npu.cc | 2 +- paddle/fluid/operators/label_smooth_op_xpu.cc | 1 - .../kernels/cpu/label_smooth_grad_kernel.cc | 45 +++++++ paddle/phi/kernels/cpu/label_smooth_kernel.cc | 50 +++++++ .../kernels/gpu/label_smooth_grad_kernel.cu | 55 ++++++++ paddle/phi/kernels/gpu/label_smooth_kernel.cu | 86 ++++++++++++ paddle/phi/kernels/label_smooth_grad_kernel.h | 28 ++++ paddle/phi/kernels/label_smooth_kernel.h | 30 +++++ paddle/phi/ops/compat/label_smooth_sig.cc | 37 ++++++ 13 files changed, 334 insertions(+), 208 deletions(-) delete mode 100644 paddle/fluid/operators/label_smooth_op.cu delete mode 100644 paddle/fluid/operators/label_smooth_op.h create mode 100644 paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/label_smooth_kernel.cc create mode 100644 paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/label_smooth_kernel.cu create mode 100644 paddle/phi/kernels/label_smooth_grad_kernel.h create mode 100644 paddle/phi/kernels/label_smooth_kernel.h create mode 100644 paddle/phi/ops/compat/label_smooth_sig.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e589f059f522b..701fc7de6940a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2040,7 +2040,7 @@ void OperatorWithKernel::BuildPtenKernelContext( (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second); // deal with optional here - if ((it == ctx.inputs.end()) && + if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == std::type_index(typeid(paddle::optional)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 5ae9fd7a61028..7e07610db2875 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/label_smooth_op.h" - #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace framework { @@ -152,11 +151,3 @@ REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, ops::LabelSmoothGradMaker, ops::LabelSmoothGradMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); -REGISTER_OP_CPU_KERNEL( - label_smooth, - ops::LabelSmoothKernel, - ops::LabelSmoothKernel); -REGISTER_OP_CPU_KERNEL( - label_smooth_grad, - ops::LabelSmoothGradKernel, - ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu deleted file mode 100644 index f149e104eff62..0000000000000 --- a/paddle/fluid/operators/label_smooth_op.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/label_smooth_op.h" -namespace paddle { -namespace operators { - -template -struct LabelSmoothFunctor { - T epsilon; - T label_dim; - - __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) { - epsilon = static_cast(epsilon_data); - label_dim = static_cast(label_dim_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return (static_cast(1 - epsilon) * x + - static_cast(epsilon / label_dim)); - } -}; - -template -struct LabelSmoothGradFunctor { - T epsilon; - - __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { - epsilon = static_cast(epsilon_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return static_cast(1 - epsilon) * x; - } -}; - -template -__global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, - const int dist_numel, const T* src, - const T* dist_data, T* dst) { - CUDA_KERNEL_LOOP(idx, N) { - int dist_idx = idx % dist_numel; - dst[idx] = static_cast(1 - epsilon) * src[idx] + - static_cast(epsilon) * dist_data[dist_idx]; - } -} - -template -class LabelSmoothGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - auto size_prob = in_t->numel(); - const T* in_data = in_t->data(); - T* out_data = out_t->mutable_data(ctx.GetPlace()); - int threads = 512; - int grid = (size_prob + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - if (dist_t) { - auto dist_numel = dist_t->numel(); - const T* dist_data = dist_t->data(); - LabelSmoothRunDistKernel<<>>( - size_prob, epsilon, dist_numel, in_data, dist_data, out_data); - - } else { - auto& dev_ctx = - ctx.template device_context(); - - std::vector ins = {in_t}; - std::vector outs = {out_t}; - auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - -template -class LabelSmoothGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev_ctx = ctx.template device_context(); - - std::vector ins = {d_out_t}; - std::vector outs = {d_in_t}; - auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - label_smooth, - ops::LabelSmoothGPUKernel, - ops::LabelSmoothGPUKernel); -REGISTER_OP_CUDA_KERNEL( - 
label_smooth_grad, - ops::LabelSmoothGradGPUKernel, - ops::LabelSmoothGradGPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h deleted file mode 100644 index 6b509eb64cce6..0000000000000 --- a/paddle/fluid/operators/label_smooth_op.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class LabelSmoothKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - out_t->mutable_data(ctx.GetPlace()); - if (label_dim != 0) { - auto epsilon = ctx.Attr("epsilon"); - auto out = framework::EigenVector::Flatten(*out_t); - auto in = framework::EigenVector::Flatten(*in_t); - auto& dev = *ctx.template device_context().eigen_device(); - if (dist_t) { - auto dist = framework::EigenVector::Flatten(*dist_t); - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon) * - dist.broadcast(Eigen::DSizes( - in_t->numel() / label_dim)); - } else { - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon / label_dim); - } - } - } -}; - -template -class LabelSmoothGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - auto d_out_dim = d_out_t->dims()[d_out_t->dims().size() - 1]; - if (d_out_dim != 0) { - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto d_in = framework::EigenVector::Flatten(*d_in_t); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - d_in.device(dev) = static_cast(1 - epsilon) * d_out; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index af519cc9090b0..c24b896e0a49a 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/label_smooth_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/label_smooth_op_xpu.cc b/paddle/fluid/operators/label_smooth_op_xpu.cc index 6b6350753909f..dd8d0c721c9c2 100644 --- a/paddle/fluid/operators/label_smooth_op_xpu.cc +++ b/paddle/fluid/operators/label_smooth_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/label_smooth_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc new file mode 100644 index 0000000000000..74664fb270b2d --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/label_smooth_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* label_grad) { + ctx.template Alloc(label_grad); + auto d_out_dim = out_grad.dims()[out_grad.dims().size() - 1]; + if (d_out_dim != 0) { + auto d_out = EigenVector::Flatten(out_grad); + auto d_in = EigenVector::Flatten(*label_grad); + + auto& dev = *ctx.eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(label_smooth_grad, + CPU, + ALL_LAYOUT, + phi::LabelSmoothGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc new file mode 100644 index 0000000000000..c76fb826cdfcc --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/label_smooth_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothKernel(const Context& ctx, + const DenseTensor& label, + paddle::optional prior_dist, + float epsilon, + DenseTensor* out) { + auto label_dim = label.dims()[label.dims().size() - 1]; + ctx.template Alloc(out); + auto& dev = *ctx.eigen_device(); + if (label_dim != 0) { + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(label); + if (prior_dist.is_initialized()) { + auto dist = EigenVector::Flatten(*prior_dist.get_ptr()); + eigen_out.device(dev) = + static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes(label.numel() / label_dim)); + } else { + eigen_out.device(dev) = static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon / label_dim); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + label_smooth, CPU, ALL_LAYOUT, phi::LabelSmoothKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu new file mode 100644 index 0000000000000..f30e8c3cdcf7a --- /dev/null +++ b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/label_smooth_grad_kernel.h" + +namespace phi { +template +struct LabelSmoothGradFunctor { + T epsilon; + + __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { + epsilon = static_cast(epsilon_data); + } + + __device__ __forceinline__ T operator()(const T x) const { + return static_cast(1 - epsilon) * x; + } +}; + +template +void LabelSmoothGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* label_grad) { + ctx.template Alloc(label_grad); + + std::vector ins = {&out_grad}; + std::vector outs = {label_grad}; + auto functor = LabelSmoothGradFunctor(epsilon); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + ctx, ins, &outs, functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL(label_smooth_grad, + GPU, + ALL_LAYOUT, + phi::LabelSmoothGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu new file mode 100644 index 0000000000000..50f7548450ce7 --- /dev/null +++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/label_smooth_kernel.h"
+
+namespace phi {
+
+template <typename T>
+struct LabelSmoothFunctor {
+  T epsilon;
+  T label_dim;
+
+  __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) {
+    epsilon = static_cast<T>(epsilon_data);
+    label_dim = static_cast<T>(label_dim_data);
+  }
+
+  __device__ __forceinline__ T operator()(const T x) const {
+    return (static_cast<T>(1 - epsilon) * x +
+            static_cast<T>(epsilon / label_dim));
+  }
+};
+
+template <typename T>
+__global__ void LabelSmoothRunDistKernel(const int N,
+                                         const float epsilon,
+                                         const int dist_numel,
+                                         const T* src,
+                                         const T* dist_data,
+                                         T* dst) {
+  CUDA_KERNEL_LOOP(idx, N) {
+    int dist_idx = idx % dist_numel;
+    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
+               static_cast<T>(epsilon) * dist_data[dist_idx];
+  }
+}
+
+template <typename T, typename Context>
+void LabelSmoothKernel(const Context& ctx,
+                       const DenseTensor& label,
+                       paddle::optional<const DenseTensor&> prior_dist,
+                       float epsilon,
+                       DenseTensor* out) {
+  auto label_dim = label.dims()[label.dims().size() - 1];
+  auto size_prob = label.numel();
+  const T* in_data = label.data<T>();
+  T* out_data = ctx.template Alloc<T>(out);
+
+  if (prior_dist.get_ptr()) {
+    int threads = 512;
+    int grid = (size_prob + threads - 1) / threads;
+    auto stream = ctx.stream();
+    const auto* dist_t = prior_dist.get_ptr();
+    auto dist_numel = dist_t->numel();
+    const T* dist_data = dist_t->data<T>();
+    LabelSmoothRunDistKernel<T><<<grid, threads, 0, stream>>>(
+        size_prob, epsilon, dist_numel, in_data, dist_data, out_data);
+
+  } else {
+    std::vector<const DenseTensor*> ins = {&label};
+    std::vector<DenseTensor*> outs = {out};
+    auto functor = LabelSmoothFunctor<T>(epsilon, label_dim);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+        ctx, ins, &outs, functor);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    label_smooth, GPU, ALL_LAYOUT, phi::LabelSmoothKernel, float, double) {}
diff --git a/paddle/phi/kernels/label_smooth_grad_kernel.h b/paddle/phi/kernels/label_smooth_grad_kernel.h
new file mode 100644
index 0000000000000..993e967814aee
--- /dev/null
+++ b/paddle/phi/kernels/label_smooth_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LabelSmoothGradKernel(const Context& ctx,
+                           const DenseTensor& out_grad,
+                           float epsilon,
+                           DenseTensor* label_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/label_smooth_kernel.h b/paddle/phi/kernels/label_smooth_kernel.h
new file mode 100644
index 0000000000000..b7e1f2708894c
--- /dev/null
+++ b/paddle/phi/kernels/label_smooth_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LabelSmoothKernel(const Context& ctx,
+                       const DenseTensor& label,
+                       paddle::optional<const DenseTensor&> prior_dist,
+                       float epsilon,
+                       DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/label_smooth_sig.cc b/paddle/phi/ops/compat/label_smooth_sig.cc
new file mode 100644
index 0000000000000..4fb62a8ca2675
--- /dev/null
+++ b/paddle/phi/ops/compat/label_smooth_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LabelSmoothOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "label_smooth", {"X", "PriorDist"}, {"epsilon"}, {"Out"}); +} + +KernelSignature LabelSmoothGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("label_smooth_grad", + {GradVarName("Out")}, + {"epsilon"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(label_smooth, phi::LabelSmoothOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(label_smooth_grad, + phi::LabelSmoothGradOpArgumentMapping); From 1a1a2ce8072250b96ca216161a21db9b40a6c136 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Wed, 23 Feb 2022 12:25:25 +0800 Subject: [PATCH 059/101] [KP] Add elementwise add xpu after phi, test=develop (#39787) * [KP] Add elementwise add xpu, test=develop * modify the File Permissions * modify the copyright time * modify code style * modify code style --- .pre-commit-config.yaml | 4 +- cmake/operators.cmake | 15 +- cmake/xpu_kp.cmake | 14 +- .../elementwise/elementwise_add_op.kps | 188 ++++++++++ paddle/fluid/platform/device_context.h | 3 + paddle/phi/core/hostdevice.h | 4 +- paddle/phi/kernels/funcs/broadcast_function.h | 14 +- paddle/phi/kernels/funcs/eigen/extensions.h | 4 + paddle/phi/kernels/funcs/elementwise_base.h | 20 +- paddle/phi/kernels/gpu/elementwise.h | 127 +++---- .../primitive/compute_primitives_xpu2.h | 4 +- .../primitive/datamover_primitives_xpu2.h | 46 +-- .../primitive/functor_primitives_xpu2.h | 209 +++++++++++ .../phi/kernels/primitive/helper_primitives.h | 2 +- .../phi/kernels/primitive/kernel_primitives.h | 11 +- .../xpu/test_elementwise_add_op_xpu_kp.py | 341 ++++++++++++++++++ 16 files changed, 890 insertions(+), 116 deletions(-) create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.kps create mode 100755 paddle/phi/kernels/primitive/functor_primitives_xpu2.h create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df2e59b7647bf..2684529930e7c 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: description: Format files with ClangFormat. 
entry: bash ./tools/codestyle/clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ - repo: local hooks: - id: cpplint-cpp-source @@ -48,7 +48,7 @@ repos: name: copyright_checker entry: python ./tools/codestyle/copyright.hook language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ exclude: | (?x)^( paddle/utils/.* diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8469dc4c02ee3..8843dd2628767 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -125,6 +125,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + list(APPEND xpu_kp_cc_srcs ${TARGET}.kps) + endif() endif() if(WITH_ASCEND_CL) string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") @@ -162,6 +165,8 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${src}) elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") list(APPEND xpu_kp_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$") + list(APPEND xpu_kp_cc_srcs ${src}) elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") list(APPEND npu_cc_srcs ${src}) elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") @@ -384,7 +389,15 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for XPU KP if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n") + foreach(xpu_kp_src ${xpu_kp_cc_srcs}) + set(op_name "") + find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n") + message(STATUS "Building KP Target: ${op_name}") + set(pybind_flag 1) + endif() + endforeach() endif() # pybind USE_OP_DEVICE_KERNEL for NPU diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index f8ab9693db0c9..adab3e1423c91 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -17,7 +17,7 @@ if(NOT WITH_XPU_KP) endif() if(NOT XPU_TOOLCHAIN) - set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK) + set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64) get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) endif() if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN}) @@ -102,7 +102,7 @@ macro(compile_kernel COMPILE_ARGS) set(XTDK_DIR ${XPU_TOOLCHAIN}) set(CXX_DIR ${HOST_SYSROOT}) - set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter 
-Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) #include path get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -127,9 +127,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.bin.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu + -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu --xpu-device-only -c -v COMMAND ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} @@ -148,9 +150,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.host.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu + -I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu --xpu-host-only -c -v WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} @@ -185,7 +189,7 @@ macro(xpu_add_library TARGET_NAME) # Distinguish .xpu file from other files foreach(cur_xpu_src IN LISTS xpu_srcs_lists) get_filename_component(language_type_name ${cur_xpu_src} EXT) - if(${language_type_name} STREQUAL ".xpu") + if(${language_type_name} STREQUAL ".kps") list(APPEND xpu_kernel_lists ${cur_xpu_src}) else() list(APPEND cc_kernel_lists ${cur_xpu_src}) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps new file mode 100644 index 0000000000000..a3fea0d7b3dbf --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -0,0 +1,188 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include // NOLINT +#include "xpu/kernel/cluster_header.h" // NOLINT +#include "xpu/kernel/debug.h" // NOLINT +#include "xpu/kernel/math.h" // NOLINT + +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddXPUKPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + const auto& xpu_ctx = + ctx.template device_context(); + paddle::operators::LaunchElementwiseCudaKernel, 1>( + xpu_ctx, ins, &outs, axis, kps::AddFunctor()); + } +}; + +static std::vector get_rdims(const std::vector& xdims, + const std::vector& ydims) { + std::vector rdims; + for (size_t i = 0; i < xdims.size(); i++) { + if (xdims[i] != ydims[i]) { + rdims.push_back(i); + } + } + return rdims; +} + +template +class ElementwiseAddGradXPUKPKernel : public ElemwiseGradKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dz = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + const framework::DDim& x_dims = x->dims(); + const framework::DDim& y_dims = y->dims(); + const framework::DDim& dz_dims = dz->dims(); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT( + axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", max_dim, + axis)); + + std::vector x_dims_vec(max_dim, 1); + std::vector y_dims_vec(max_dim, 1); + std::vector z_dims_vec(max_dim, 1); + if (x_dims.size() == max_dim) { + for (int i = 0; i < max_dim; i++) { + x_dims_vec[i] = x_dims[i]; + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + x_dims_vec[i + axis] = x_dims[i]; + } + } + + if (y_dims.size() == max_dim) { + for (int i = 0; i < max_dim; i++) { + y_dims_vec[i] = y_dims[i]; + } + } else { + for (int i = 0; i < y_dims.size(); i++) { + y_dims_vec[i + axis] = y_dims[i]; + } + } + + for (int i = 0; i < max_dim; i++) { + z_dims_vec[i] = dz_dims[i]; + } + std::vector rdims_for_x; + std::vector rdims_for_y; + rdims_for_x = get_rdims(x_dims_vec, z_dims_vec); + rdims_for_y = get_rdims(y_dims_vec, z_dims_vec); + const T* dz_data = dz->data(); + auto& dev_ctx = + ctx.template device_context(); + + if (dx != nullptr) { + T* dx_data = dx->mutable_data(ctx.GetPlace()); + if (rdims_for_x.size() == 0) { + if (dx_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dz, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(*dz)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + + int ret = xpu::reduce_sum( + dev_ctx.x_context(), reinterpret_cast(dz_data), + reinterpret_cast(dx_data), z_dims_vec, rdims_for_x); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); + } + } + + if (dy != nullptr) { + T* dy_data = dy->mutable_data(ctx.GetPlace()); + if (rdims_for_y.size() == 0) { + if (dy_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dy); + } + } else { + int ret = xpu::reduce_sum( + dev_ctx.x_context(), reinterpret_cast(dz_data), + reinterpret_cast(dy_data), z_dims_vec, rdims_for_y); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, + ops::ElementwiseAddXPUKPKernel); + +REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace, + ops::ElementwiseAddGradXPUKPKernel); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 17288b354a280..e9124dfc1f8a7 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -74,7 +74,10 @@ limitations under the License. 
 #include "paddle/fluid/platform/device/device_ext.h"
 #include "paddle/fluid/platform/device/stream.h"
+
+#if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__)
 #include "unsupported/Eigen/CXX11/Tensor"
+#endif
 
 namespace Eigen {
 struct DefaultDevice;
diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h
index 08fe3125287d7..0869df143235f 100644
--- a/paddle/phi/core/hostdevice.h
+++ b/paddle/phi/core/hostdevice.h
@@ -18,14 +18,14 @@
 #include <hip/hip_runtime.h>
 #endif
 
-#ifdef __xpu_kp__
+#if defined(__xpu__)
 #include <xpu/runtime.h>
 #include "xpu/kernel/cluster_header.h"
 #include "xpu/kernel/debug.h"
 #include "xpu/kernel/math.h"
 #endif
 
-#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__))
+#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__))
 #define HOSTDEVICE __host__ __device__
 #define DEVICE __device__
 #define HOST __host__
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index be57b8630f895..84a36b849afa1 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 
 namespace kps = phi::kps;
 
@@ -122,7 +122,7 @@ struct DimensionsTransform {
   explicit DimensionsTransform(const std::vector<const DenseTensor *> &ins,
                                const phi::DDim &dims,
                                int axis) {
-    const int N = max(static_cast<int>(ins.size()), 2);
+    const int N = std::max(static_cast<int>(ins.size()), 2);
     dim_size = dims.size();
     out_dims = phi::vectorize(dims);
     in_dims.resize(N);
@@ -183,7 +183,7 @@ struct DimensionsTransform {
   }
 };
 
-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 
 template <typename T, int VecSize, int Rank, bool IsBoundary = false>
 __device__ __forceinline__ void LoadData(
@@ -268,7 +268,7 @@ __global__ void VectorizedBroadcastKernel(
   int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
   int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
 
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
   for (; block_offset < main_offset; block_offset += stride) {
     VectorizedBroadcastKernelImpl
   phi::Array<_ptr_ OutT *, NumOuts> outs_data;
   for (int i = 0; i < NumOuts; ++i) {
-    outs_data[i] = ctx.Alloc<OutT>((*outs)[i]);
+    outs_data[i] = (_ptr_ OutT *)(ctx.Alloc<OutT>((*outs)[i]));
   }
 
   for (int i = 0; i < Arity; i++) {
     use_broadcast[i] = (ins[i]->numel() != numel);
-    ins_data[i] = (_ptr_ InT *)(ins[i]->data<InT>());
+    ins_data[i] = (const _ptr_ InT *)(ins[i]->data<InT>());
     if (use_broadcast[i]) {
       // get the broadcast config,
       // if data shape is[m, n], then you should set data_dim = {n, m}
@@ -363,7 +363,7 @@ void LaunchBroadcastKernel(const KPDevice &ctx,
     }
   }
 
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
   const int threads = 64;
   const int blocks = 8;
   int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h
index 5fc8f76d988d1..fbb9d8e3d2ef5 100644
--- a/paddle/phi/kernels/funcs/eigen/extensions.h
+++ b/paddle/phi/kernels/funcs/eigen/extensions.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#ifndef __xpu__
+
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
@@ -435,3 +437,5 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) {
 }  // namespace numext
 
 }  // namespace Eigen
+
+#endif  // __xpu__
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 9a429dfaaf957..47f1593a11eb9 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -21,12 +21,13 @@ limitations under the License. */
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
+#define HOSTDEVICE __host__ __device__
 
 namespace kps = phi::kps;
 
 #endif
@@ -436,7 +437,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout,
   }
 }
 
-#if defined(__NVCC__) || defined(__HIPCC__)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 
 // static unroller
 template