From 1456b02d8909b471dccee82ffded985cd8dc39dd Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Wed, 5 Jan 2022 03:51:17 +0100 Subject: [PATCH 001/151] Quantize nearest_interp and nearest_interp_v2 (#38622) * Quantize nearest_interp and nearest_interp_v2 * Check if avx_core supported * Add depthwise_conv2d to supported quantization list --- .../framework/ir/graph_pattern_detector.cc | 37 +++-- .../framework/ir/graph_pattern_detector.h | 15 ++ .../framework/ir/mkldnn/cpu_quantize_pass.cc | 62 ++++++++ .../framework/ir/mkldnn/cpu_quantize_pass.h | 1 + .../ir/mkldnn/cpu_quantize_pass_tester.cc | 15 +- .../ir/mkldnn/cpu_quantize_placement_pass.cc | 33 +++-- .../cpu_quantize_placement_pass_tester.cc | 26 ++++ .../fluid/inference/api/mkldnn_quantizer.cc | 3 +- .../inference/api/mkldnn_quantizer_config.cc | 12 ++ .../quantization/quant2_int8_mkldnn_pass.py | 3 +- .../tests/test_quant2_int8_mkldnn_pass.py | 135 ++++++++++++++++++ 11 files changed, 319 insertions(+), 23 deletions(-) mode change 100755 => 100644 paddle/fluid/framework/ir/graph_pattern_detector.cc diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc old mode 100755 new mode 100644 index cd9292b59a359..6949e4d078c0c --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1641,6 +1641,32 @@ PDNode *patterns::Slice::operator()() { return slice_out; } +PDNode *patterns::NearestInterp::operator()() { + auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + + auto nearest_interp_op = + pattern->NewNode(nearest_interp_op_repr()) + ->assert_is_ops({"nearest_interp", "nearest_interp_v2"}); + + auto nearest_interp_in = + pattern->NewNode(nearest_interp_in_repr()) + ->AsInput() + ->assert_is_ops_input({"nearest_interp", "nearest_interp_v2"}, "X"); + auto nearest_interp_out = + pattern->NewNode(nearest_interp_out_repr()) + ->AsOutput() + ->assert_is_ops_output({"nearest_interp", "nearest_interp_v2"}, + "Out"); + + auto next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + prev_op->LinksTo({nearest_interp_in}); + nearest_interp_op->LinksFrom({nearest_interp_in}) + .LinksTo({nearest_interp_out}); + next_op->LinksFrom({nearest_interp_out}); + return nearest_interp_out; +} + PDNode *patterns::Matmul::operator()() { auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); @@ -2376,15 +2402,8 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { - std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "elementwise_add", - "fc", "matmul", "pool2d", "prior_box", - "reshape2", "transpose2", "fusion_gru", - "fusion_lstm", "multi_gru", "slice"}); - if (!quantize_enabled_op_types.empty()) { - supported_op_types = quantize_enabled_op_types; - } - auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); + auto *op = + pattern->NewNode(op_repr())->assert_is_ops(quantize_enabled_op_types); return op; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index deaba36ba5da2..940f6b8561e48 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -995,6 +995,21 @@ struct Slice : public PatternBase { PATTERN_DECL_NODE(next_op); }; +// Nearest Interp op +// Forward pass for nearest_interp. 
+// nearest_interp_out is a result of the operator. +struct NearestInterp : public PatternBase { + NearestInterp(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "nearest_interp") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(nearest_interp_in); + PATTERN_DECL_NODE(nearest_interp_op); + PATTERN_DECL_NODE(nearest_interp_out); + PATTERN_DECL_NODE(next_op); +}; + // Matmul op // Forward pass for matmul. struct Matmul : public PatternBase { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 3df4a84470524..64d9bf603533e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1053,6 +1053,67 @@ void CPUQuantizePass::QuantizeFusionLSTM(Graph* graph) const { PrettyLogDetail("--- quantized %d fusion_lstm ops", quantize_count); } +void CPUQuantizePass::QuantizeNearestInterp(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::NearestInterp nearest_interp_pattern{pattern, name_scope_}; + nearest_interp_pattern(); + + int quantize_nearest_interp_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize nearest_interp op"; + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_op, nearest_interp_op, + nearest_interp_pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(nearest_interp_op->Op())) { + LogQuantizationDisabled(nearest_interp_op); + return; + } + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, nearest_interp_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, nearest_interp_pattern); + + // skip if prev op and next op is not quantized + if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) { + LogCannotQuantizeOp(nearest_interp_op, + "There are no other quantized operators nearby, so " + "quantization is not recommended."); + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_in, nearest_interp_in, + nearest_interp_pattern); + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_out, nearest_interp_out, + nearest_interp_pattern); + + if (!AreScalesPresentForNodes({nearest_interp_in, nearest_interp_out})) { + LogCannotQuantizeOp(nearest_interp_op); + return; + } + + bool is_input_unsigned{false}; + auto input_scale = + GetScaleValueForNode(nearest_interp_in, &is_input_unsigned); + QuantizeInput(g, nearest_interp_op, nearest_interp_in, "X", input_scale, + is_input_unsigned); + + bool is_output_unsigned{false}; + auto output_scale = + GetScaleValueForNode(nearest_interp_out, &is_output_unsigned); + DequantizeOutput(g, nearest_interp_op, nearest_interp_out, "Out", + output_scale, is_output_unsigned); + + ++quantize_nearest_interp_count; + }; + + gpd(graph, handler); + AddStatis(quantize_nearest_interp_count); + + PrettyLogDetail("--- quantized %d nearest_interp ops", + quantize_nearest_interp_count); +} + void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -1076,6 +1137,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); QuantizeSlice(graph); + QuantizeNearestInterp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index b3ee98263c0c0..412c4e40a01d5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ 
b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -62,6 +62,7 @@ class CPUQuantizePass : public FusePassBase { void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; void QuantizeSlice(Graph* graph) const; + void QuantizeNearestInterp(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_input_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index db1a10e3e31b2..e7c236bc489b7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -58,7 +58,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_in", 1.0f); op->SetAttr("Scale_out", 1.0f); op->SetAttr("Scale_weights", std::vector{1.0f}); - } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") { + } else if (type == "pool2d" || type == "transpose2" || type == "reshape2" || + type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "slice") { @@ -434,6 +435,18 @@ TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { TestImmutableOpBetweenNonQuantizedOp("slice"); } +TEST(CpuQuantizePass, nearestInterp) { TestImmutableOp("nearest_interp"); } + +TEST(CpuQuantizePass, nearestInterpBetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("nearest_interp"); +} + +TEST(CpuQuantizePass, nearestInterpV2) { TestImmutableOp("nearest_interp_v2"); } + +TEST(CpuQuantizePass, nearestInterpV2BetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("nearest_interp_v2"); +} + static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 1a701e2ef0a7e..5f74b61ee86aa 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" + #include namespace paddle { @@ -23,15 +24,34 @@ class Graph; void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; + std::unordered_set supported_op_types = + std::unordered_set( + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", + "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", + "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", + "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = Get>("quantize_enabled_op_types"); + + if (!op_types_list.empty()) { + // Verify that all user-specified operators can be quantized. 
+ for (const auto& op : op_types_list) { + PADDLE_ENFORCE_NE( + supported_op_types.count(op), 0, + platform::errors::InvalidArgument( + "Pass attribute quantize_enabled_op_types contains operator %s " + "that is not supported by OneDNN quantization.", + op)); + } + supported_op_types = op_types_list; + } Init(name_scope_, graph); GraphPatternDetector gpd; patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(), "quantize_placement"}; - quantize_placement_pattern(op_types_list); + quantize_placement_pattern(supported_op_types); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -46,16 +66,7 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { return; } - if (op->Op()->HasAttr("mkldnn_data_type") || - op->Op()->HasProtoAttr("mkldnn_data_type")) { - // use_quantizer is no longer used - // assign value for compatibility - if (op->Op()->GetAttrIfExists("use_quantizer")) { - op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); - } - op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); - op->Op()->SetAttr("use_quantizer", true); - } + op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); }; gpd(graph, handler); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index daf913bf7d80d..350fad2c672d4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -140,6 +140,32 @@ TEST(QuantizerPlacementPass, default_attr_value) { DefaultAttrTest(5); } +void EnabledOpTypesTest( + std::initializer_list quantize_enabled_op_types, + std::string missing_op) { + auto prog = BuildProgramDesc(); + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass"); + pass->Set("quantize_enabled_op_types", + new std::unordered_set(quantize_enabled_op_types)); + + try { + graph.reset(pass->Apply(graph.release())); + } catch (paddle::platform::EnforceNotMet& err) { + std::string ex_msg = err.what(); + std::string expected_msg = + "Pass attribute quantize_enabled_op_types contains operator " + + missing_op + " that is not supported by OneDNN quantization."; + EXPECT_TRUE(ex_msg.find(expected_msg) != std::string::npos); + } +} + +TEST(QuantizerPlacementPass, unsupported_op_type) { + // Dropout op is not supported by OneDNN quantization + EnabledOpTypesTest({"conv2d", "dropout"}, "dropout"); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index aa29b779e471b..ef9d03d1dcbaf 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -124,7 +124,8 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( } else if (op->Type() == "relu") { is_unsigned = true; } else if (op->Type() == "transpose2" || op->Type() == "reshape2" || - op->Type() == "pool2d") { + op->Type() == "pool2d" || op->Type() == "nearest_interp" || + op->Type() == "nearest_interp_v2") { auto input_var_name = op->Input("X")[0]; PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 6642a2c030b26..d4fa78518e149 100644 --- 
a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -107,6 +107,18 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["fusion_lstm"]["ReorderedC0"] = ScaleAlgo::NONE; rules_["fusion_lstm"]["CheckedCell"] = ScaleAlgo::NONE; rules_["fusion_lstm"]["Hidden"] = ScaleAlgo::KL; + + rules_["nearest_interp"]["X"] = ScaleAlgo::KL; + rules_["nearest_interp"]["OutSize"] = ScaleAlgo::NONE; + rules_["nearest_interp"]["SizeTensor"] = ScaleAlgo::NONE; + rules_["nearest_interp"]["Scale"] = ScaleAlgo::NONE; + rules_["nearest_interp"]["Out"] = ScaleAlgo::NONE; + + rules_["nearest_interp_v2"]["X"] = ScaleAlgo::KL; + rules_["nearest_interp_v2"]["OutSize"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["SizeTensor"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["Scale"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["Out"] = ScaleAlgo::NONE; } ScaleAlgo MkldnnQuantizerConfig::scale_algo( diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 3e5db06a86a37..7dbd927874d19 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -63,7 +63,8 @@ def __init__(self, self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) self._scale_immutable_ops = [ - 'transpose2', 'reshape2', 'pool2d', 'slice' + 'transpose2', 'reshape2', 'pool2d', 'slice', 'nearest_interp', + 'nearest_interp_v2' ] self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index 994f89ab3e9f3..f0dae081dd48f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -216,6 +216,141 @@ def test_quant_update_activation(self): graph = quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) + class TestQuant2Int8MkldnnPassNearestInterp(unittest.TestCase): + def op_name(self): + return "nearest_interp" + + def setUp(self): + self.scope = fluid.Scope() + self.place = fluid.CPUPlace() + self.dtype = np.float32 + self.use_cudnn = False + self.use_mkldnn = True + + # conv2d + self.data_format = "ANYLAYOUT" + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [1, 3, 5, 5] + self.filter_size = [16, 3, 3, 3] + self.conv_output_size = [1, 16, 3, 3] + self.input = np.random.random(self.input_size).astype(self.dtype) + self.filter = np.random.random(self.filter_size).astype(self.dtype) + self.conv_output = np.ndarray(self.conv_output_size).astype( + self.dtype) + + # nearest_interp + self.out_h = 1 + self.out_w = 1 + self.scale = 2.0 + self.interp_method = 'nearest' + self.data_layout = 'NCHW' + self.nearest_interp_output_size = [1, 1, 2, 2] + self.nearest_interp_output = np.ndarray( + self.nearest_interp_output_size).astype(self.dtype) + + # dropout + self.dropout_prob = 0.5 + self.dropout_out = np.ndarray( + self.nearest_interp_output_size).astype(self.dtype) + self.dropout_mask = np.ndarray(self.nearest_interp_output_size) + + self.quantized_ops = { + "conv2d", "nearest_interp", "nearest_interp_v2" + } + self.variables = { + "input": self.input, + "filter": self.filter, + 
"conv_output": self.conv_output, + "nearest_interp_output": self.nearest_interp_output, + "dropout_out": self.dropout_out, + 'dropout_mask': self.dropout_mask + } + + def prepare_program(self, program): + block = program.global_block() + for name in self.variables: + block.create_var( + name=name, + dtype="float32", + shape=self.variables[name].shape) + block.append_op( + type="conv2d", + inputs={ + "Input": block.var('input'), + 'Filter': block.var('filter') + }, + outputs={"Output": block.var('conv_output')}, + attrs={ + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'fuse_relu': True + }) + block.append_op( + type=self.op_name(), + inputs={"X": block.var('conv_output'), }, + outputs={"Out": block.var('nearest_interp_output')}, + attrs={ + 'interp_method': self.interp_method, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'scale': self.scale, + 'data_layout': self.data_layout, + 'use_mkldnn': self.use_mkldnn + }) + block.append_op( + type='dropout', + inputs={"X": block.var('nearest_interp_output'), }, + outputs={ + 'Out': block.var('dropout_out'), + 'Mask': block.var('dropout_mask') + }, + attrs={'dropout_prob': self.dropout_prob, }) + + def check_graph_after_pass(self, graph): + for op in graph.all_op_nodes(): + if op.op().type() in self.quantized_ops: + self.assertTrue(op.op().has_attr("mkldnn_data_type")) + self.assertTrue(op.op().attr("mkldnn_data_type") == "int8") + + def test_quant_update_activation(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program(program) + graph = IrGraph(core.Graph(program.desc), for_test=True) + quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + + input_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor( + np.array(self.scale).astype(np.float64)) + output_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor( + np.array(1. / self.scale * self.scale).astype(np.float64)) + var_scale = { + "input": (False, input_scale_tensor), + "filter": (False, input_scale_tensor), + "conv_output": (False, output_scale_tensor), + } + if core.avx_supported(): + quant2_int8_mkldnn_pass._var_quant_scales = var_scale + graph = quant2_int8_mkldnn_pass._propagate_scales(graph) + graph = quant2_int8_mkldnn_pass._quantize_fp32_graph(graph) + self.check_graph_after_pass(graph) + + class TestQuant2Int8MkldnnPassNearestInterpV2(unittest.TestCase): + def op_name(self): + return "nearest_interp_v2" + if __name__ == '__main__': unittest.main() From f289cf851115286064c9d0864f153b8dae1a960d Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 5 Jan 2022 11:00:45 +0800 Subject: [PATCH 002/151] inference c_api support std::string (#38667) * c_api support std::string * update * update * add NOTE * fix delete error. 
--- paddle/fluid/inference/capi_exp/pd_config.cc | 6 ++--- paddle/fluid/inference/capi_exp/pd_config.h | 2 +- paddle/fluid/inference/capi_exp/pd_types.h | 5 ++++ paddle/fluid/inference/capi_exp/pd_utils.cc | 23 +++++++++++++++++++ paddle/fluid/inference/capi_exp/pd_utils.h | 9 ++++++++ .../fluid/inference/capi_exp/utils_internal.h | 8 +++++++ paddle/fluid/inference/goapi/config.go | 4 ++-- .../tests/api/analyzer_capi_exp_tester.cc | 5 ++++ 8 files changed, 55 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index b1ad2f6c87cc6..e342190fda1ac 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -459,12 +459,10 @@ __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( std::vector passes = config->pass_builder()->AllPasses(); return paddle_infer::CvtVecToOneDimArrayCstr(passes); } -const char* PD_ConfigSummary(__pd_keep PD_Config* pd_config) { +__pd_give PD_Cstr* PD_ConfigSummary(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; auto sum_str = config->Summary(); - char* c = reinterpret_cast(malloc(sum_str.length() + 1)); - snprintf(c, sum_str.length() + 1, "%s", sum_str.c_str()); - return c; + return paddle_infer::CvtStrToCstr(sum_str); } } // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index e8ab9357dc95d..c314aca918f14 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -705,7 +705,7 @@ PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( /// /// \return Return config info. /// -PADDLE_CAPI_EXPORT extern const char* PD_ConfigSummary( +PADDLE_CAPI_EXPORT extern __pd_give PD_Cstr* PD_ConfigSummary( __pd_keep PD_Config* pd_config); #ifdef __cplusplus diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h index a5da2913a9b20..62c54616535cf 100644 --- a/paddle/fluid/inference/capi_exp/pd_types.h +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -34,6 +34,11 @@ typedef struct PD_OneDimArrayCstr { char** data; } PD_OneDimArrayCstr; // std::vector +typedef struct PD_Cstr { + size_t size; + char* data; +} PD_Cstr; // std::string + typedef struct PD_TwoDimArraySize { size_t size; PD_OneDimArraySize** data; diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc index 94362b8784bb3..efca350fbaf49 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.cc +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -78,6 +78,17 @@ void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { delete array; } } + +void PD_CstrDestroy(__pd_take PD_Cstr* cstr) { + if (cstr != NULL) { + if (cstr->size != 0) { + cstr->size = 0; + delete[] cstr->data; + cstr->data = NULL; + } + delete cstr; + } +} namespace paddle_infer { __pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( @@ -101,6 +112,18 @@ std::vector CvtOneDimArrayToVecCstr( return vec; } +__pd_give PD_Cstr* CvtStrToCstr(const std::string& str) { + PD_Cstr* cstr = new PD_Cstr; + if (str.empty()) { + cstr->size = 0; + cstr->data = NULL; + } else { + cstr->size = str.length() + 1; + cstr->data = new char[str.length() + 1]; + memcpy(cstr->data, str.c_str(), str.length() + 1); + } + return cstr; +} } // namespace paddle_infer #define DESTROY_TWO_DIM_ARRAY(type) \ diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h 
index 68e519d4bb5e9..8743c58db76c9 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.h +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -65,6 +65,15 @@ PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( __pd_take PD_TwoDimArraySize* array); +/// +/// \brief Destroy the PD_Cstr object pointed to by the pointer. +/// NOTE: if input string is empty, the return PD_Cstr's size is +/// 0 and data is NULL. +/// +/// \param[in] cstr pointer to the PD_Cstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_CstrDestroy(__pd_take PD_Cstr* cstr); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h index fbae512ecd855..95b16dbd59943 100644 --- a/paddle/fluid/inference/capi_exp/utils_internal.h +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -114,6 +114,14 @@ __pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( std::vector> CvtTwoDimArrayToVecSize( __pd_keep const PD_TwoDimArraySize* array); +/// +/// \brief Convert the 'std::string' object to a 'PD_Cstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_Cstr* CvtStrToCstr(const std::string& vec); + /// /// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' /// object. diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index a95bb6bef6ee4..def26913b0a1c 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -833,7 +833,7 @@ func (config *Config) AllPasses() []string { /// func (config *Config) Summary() string { cSummary := C.PD_ConfigSummary(config.c) - summary := C.GoString(cSummary) - C.free(unsafe.Pointer(cSummary)) + summary := C.GoString(cSummary.data) + C.PD_CstrDestroy(cSummary) return summary } diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc index 11de1a5a6fab4..4b2852be86149 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/inference/capi_exp/pd_config.h" #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { @@ -34,6 +36,8 @@ void predictor_run() { PD_ConfigSetCpuMathLibraryNumThreads(config, 10); PD_ConfigSwitchIrDebug(config, TRUE); PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_Cstr *config_summary = PD_ConfigSummary(config); + LOG(INFO) << config_summary->data; PD_Predictor *predictor = PD_PredictorCreate(config); PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); @@ -51,6 +55,7 @@ void predictor_run() { delete[] input; PD_TensorDestroy(tensor); + PD_CstrDestroy(config_summary); PD_PredictorDestroy(predictor); } From 6792312445d0f05bb1912745021420c50615185f Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 5 Jan 2022 04:01:18 +0100 Subject: [PATCH 003/151] Fix for matmul_v2 oneDNN op broadcasting when inputs dims have different lengths (#38665) * fix for matmul_v2 broadcasting * fix for output shape not broadcasted --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 69 ++++++++++++------- .../mkldnn/test_matmul_v2_mkldnn_op.py | 35 ++++++++++ 2 files changed, 79 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 5cb6ae34dcecf..a8d4b852ca3c2 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -295,7 +295,7 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { - x_bd_dims[i] = x_dims[i]; + x_bd_dims[x_bd_dims.size() - x_dims.size() + i] = x_dims[i]; } } if (y_dims.size() == 1) { @@ -305,21 +305,21 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { - y_bd_dims[i] = y_dims[i]; + y_bd_dims[y_bd_dims.size() - y_dims.size() + i] = y_dims[i]; } } - if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2 && - !IsOutputFused(ctx)) { - for (size_t i = 0; i < x_dims.size() - 2; ++i) { + if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) { + for (size_t i = 0; i < x_bd_dims.size() - 2; ++i) { PADDLE_ENFORCE_EQ( - x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true, - paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, x_dims[i], i, y_dims[i])); - out_dims[i] = std::max(x_dims[i], y_dims[i]); + x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 || + y_bd_dims[i] == 1, + true, paddle::platform::errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." 
+ "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, x_bd_dims[i], i, y_bd_dims[i])); + out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); } out->Resize(make_ddim(out_dims)); } @@ -382,11 +382,11 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { dy_tmp->mutable_data(ctx.GetPlace()); } - void ReduceSumForMatmulGradOutput(const ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, - const dnnl::engine onednn_engine, - const Tensor* dx_tmp, Tensor* dx, - std::vector dx_dims) const { + void ReduceSumForMatmulGradOutput( + const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, + const dnnl::engine onednn_engine, const Tensor* dx_tmp, Tensor* dx, + std::vector& dx_dims, + const std::vector& squeezed_dims) const { paddle::platform::ReductionMKLDNNHandler handler( dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), dx_tmp, dx, dx_dims); @@ -402,6 +402,19 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); + + dx->set_format(paddle::platform::GetMKLDNNFormat( + dst_memory_p->get_desc().reshape(squeezed_dims))); + } + + std::vector ExtendDimsWithOnes(const std::vector& dims, + int new_size) const { + std::vector new_dims(new_size, 1); + for (size_t i = 0; i < dims.size(); ++i) { + new_dims[new_size - dims.size() + i] = dims[i]; + } + + return new_dims; } void RunKernel(const ExecutionContext& ctx) const { @@ -440,8 +453,14 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { bool trans_y = ctx.Attr("trans_y"); auto dout_dims = vectorize(dout->dims()); - int ndims = std::max(x->dims().size(), y->dims().size()); - ndims = std::max(ndims, 3); + size_t ndims = std::max(x->dims().size(), y->dims().size()); + ndims = std::max(ndims, 3); + + if (x_dims.size() != ndims) { + x_dims = ExtendDimsWithOnes(x_dims, ndims); + } else if (y_dims.size() != ndims) { + y_dims = ExtendDimsWithOnes(y_dims, ndims); + } // in broadcasting scenario new memory is required because // reduce sum must be calculated upon broadcasted dims @@ -481,21 +500,21 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { if (x_dims != dx_bd_dims) { ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dx_tmp, dx, - x_dims); + x_dims, + paddle::framework::vectorize(x->dims())); } else { *dx = std::move(dx_tmp); } if (y_dims != dy_bd_dims) { ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dy_tmp, dy, - y_dims); + y_dims, + paddle::framework::vectorize(y->dims())); } else { *dy = std::move(dy_tmp); } - dx->set_layout(paddle::framework::DataLayout::kMKLDNN); - dx->set_format(x->format()); - dy->set_layout(paddle::framework::DataLayout::kMKLDNN); - dy->set_format(y->format()); + dx->Resize(x->dims()); + dy->Resize(y->dims()); } private: diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 5dd1795818c2b..25701b797ec4a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -262,6 +262,41 @@ def config(self): self.trans_y = False +class TestMatMulV2MatrixXMatrix4Dx3DTransposeXOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (5, 4, 15, 10) + self.y_shape = (1, 15, 20) + self.trans_x = True + self.trans_y = False + + +class 
TestMatMulV2MatrixXMatrix3Dx4DTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 10, 15) + self.y_shape = (4, 2, 20, 15) + self.trans_x = False + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix5Dx3DTransposeXTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (4, 3, 2, 15, 10) + self.y_shape = (1, 20, 15) + self.trans_x = True + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix3Dx4DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 32, 16) + self.y_shape = (16, 16, 16) + self.trans_x = False + self.trans_y = False + + # BF16 TESTS def create_bf16_test_class(parent): @OpTestTool.skip_if_not_cpu_bf16() From d6df5bd98a21a2bd68bba69925722054c5ac08eb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 5 Jan 2022 12:07:38 +0800 Subject: [PATCH 004/151] [PTen] Polish infermeta filename (#38695) * polish infermeta filename * polish infermeta filename --- paddle/pten/include/infermeta.h | 3 ++- paddle/pten/infermeta/CMakeLists.txt | 2 +- paddle/pten/infermeta/multiary.cc | 17 +++++++++++++++++ paddle/pten/infermeta/multiary.h | 17 +++++++++++++++++ paddle/pten/infermeta/{nary.cc => nullary.cc} | 2 +- paddle/pten/infermeta/{nary.h => nullary.h} | 0 paddle/pten/kernels/empty_kernel.h | 2 +- 7 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 paddle/pten/infermeta/multiary.cc create mode 100644 paddle/pten/infermeta/multiary.h rename paddle/pten/infermeta/{nary.cc => nullary.cc} (96%) rename paddle/pten/infermeta/{nary.h => nullary.h} (100%) diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h index 151cb638d85b7..5e356dd37c03e 100644 --- a/paddle/pten/include/infermeta.h +++ b/paddle/pten/include/infermeta.h @@ -16,5 +16,6 @@ limitations under the License. */ // See Note: [ How do we organize the kernel directory ] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/infermeta/nary.h" +#include "paddle/pten/infermeta/multiary.h" +#include "paddle/pten/infermeta/nullary.h" #include "paddle/pten/infermeta/unary.h" diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt index 2b4bba8313f58..f92727f33fb05 100644 --- a/paddle/pten/infermeta/CMakeLists.txt +++ b/paddle/pten/infermeta/CMakeLists.txt @@ -1 +1 @@ -cc_library(infermeta SRCS nary.cc unary.cc binary.cc DEPS convert_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils) diff --git a/paddle/pten/infermeta/multiary.cc b/paddle/pten/infermeta/multiary.cc new file mode 100644 index 0000000000000..5dbf3d58a1952 --- /dev/null +++ b/paddle/pten/infermeta/multiary.cc @@ -0,0 +1,17 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/infermeta/multiary.h" + +namespace pten {} // namespace pten diff --git a/paddle/pten/infermeta/multiary.h b/paddle/pten/infermeta/multiary.h new file mode 100644 index 0000000000000..6aa15159630bc --- /dev/null +++ b/paddle/pten/infermeta/multiary.h @@ -0,0 +1,17 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace pten {} // namespace pten diff --git a/paddle/pten/infermeta/nary.cc b/paddle/pten/infermeta/nullary.cc similarity index 96% rename from paddle/pten/infermeta/nary.cc rename to paddle/pten/infermeta/nullary.cc index 5287c5cca1439..731e69e60907b 100644 --- a/paddle/pten/infermeta/nary.cc +++ b/paddle/pten/infermeta/nullary.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ // See Note [ Why still include the fluid headers? ] -#include "paddle/pten/infermeta/nary.h" +#include "paddle/pten/infermeta/nullary.h" namespace pten { diff --git a/paddle/pten/infermeta/nary.h b/paddle/pten/infermeta/nullary.h similarity index 100% rename from paddle/pten/infermeta/nary.h rename to paddle/pten/infermeta/nullary.h diff --git a/paddle/pten/kernels/empty_kernel.h b/paddle/pten/kernels/empty_kernel.h index 3249526805bfb..d71ee0b1266f2 100644 --- a/paddle/pten/kernels/empty_kernel.h +++ b/paddle/pten/kernels/empty_kernel.h @@ -17,7 +17,7 @@ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/infermeta/nary.h" +#include "paddle/pten/infermeta/nullary.h" #include "paddle/pten/infermeta/unary.h" namespace pten { From c90a652d77c4149cb3f700c7840a45ea2f08804d Mon Sep 17 00:00:00 2001 From: wawltor Date: Wed, 5 Jan 2022 14:33:17 +0800 Subject: [PATCH 005/151] add the examples for the mm (#38669) * add the examples for the mm * fix the document of paddle.mm --- python/paddle/tensor/math.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 5f7e51598cc43..c4a92b1486d58 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1118,6 +1118,38 @@ def mm(input, mat2, name=None): Returns: Tensor: The product Tensor. + :: + + * example 1: + + input: [B, ..., M, K], mat2: [B, ..., K, N] + out: [B, ..., M, N] + + * example 2: + + input: [B, M, K], mat2: [B, K, N] + out: [B, M, N] + + * example 3: + + input: [B, M, K], mat2: [K, N] + out: [B, M, N] + + * example 4: + + input: [M, K], mat2: [K, N] + out: [M, N] + + * example 5: + + input: [B, M, K], mat2: [K] + out: [B, M] + + * example 6: + + input: [K], mat2: [K] + out: [1] + Examples: .. 
code-block:: python From 7a4a512daa172062068c7fab669bd321f1926274 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 5 Jan 2022 14:33:46 +0800 Subject: [PATCH 006/151] [pten]Move reduce code new (#38648) * change 'math' to 'math_kernel' * fix compile bugs * merge develop * fix compile bugs * fix compile bugs * move reduce files by new rule * add set header * format code style * merge develop and fix conflict * merge develop and fix conflict Co-authored-by: YuanRisheng --- .../reduce_ops/check_reduce_rank_test.cu | 2 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 2 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 6 +- paddle/pten/include/math.h | 4 +- paddle/pten/kernels/cpu/math_kernel.cc | 41 +++++------ .../general/reduce_impl.h => cpu/reduce.h} | 9 ++- paddle/pten/kernels/gpu/math_kernel.cu | 33 +++++---- .../reduce_cuda_impl.h => gpu/reduce.h} | 48 +++++++++++++ .../pten/kernels/hybird/cuda/reduce/reduce.h | 71 ------------------- paddle/pten/kernels/math_kernel.h | 26 +++---- 10 files changed, 107 insertions(+), 135 deletions(-) rename paddle/pten/kernels/{hybird/general/reduce_impl.h => cpu/reduce.h} (95%) rename paddle/pten/kernels/{hybird/cuda/reduce/reduce_cuda_impl.h => gpu/reduce.h} (96%) delete mode 100644 paddle/pten/kernels/hybird/cuda/reduce/reduce.h diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index 63d42790205ab..33e195f899209 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "gtest/gtest.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" +#include "paddle/pten/kernels/gpu/reduce.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index e779da641b963..62486f62f66f8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,7 +32,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" +#include "paddle/pten/kernels/gpu/reduce.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index bd09a7951aa2c..e1854d8a13d8b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,10 +28,10 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" #include "paddle/pten/include/math.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" +#include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" +#include "paddle/pten/kernels/gpu/reduce.h" #endif namespace paddle { @@ -259,7 +259,7 @@ class ReduceKernel : public framework::OpKernel { std::vector tmp_dims(dims.begin(), dims.end()); // call new kernel - pten::general::Reduce( + pten::Reduce( dev_ctx, *pt_x.get(), reduce_all, tmp_dims, keep_dim, pten::TransToPtenDataType(cast_out_dtype), pt_out.get()); } diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index 9abfa297a9452..e46f460260adb 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -45,7 +45,7 @@ DenseTensor Mean(const ContextT& dev_ctx, dev_ctx.GetPlace()), std::move(out_meta)); bool reduce_all = false; - Mean(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); + MeanKernel(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); return dense_out; } @@ -65,7 +65,7 @@ DenseTensor Sum(const ContextT& dev_ctx, // so use default value(false) is OK. bool reduce_all = false; - Sum( + SumKernel( dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index c022dd08bbe40..4f895d9514a97 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -18,13 +18,10 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" - #include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/cpu/reduce.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" -#include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" - // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/bfloat16.h" @@ -57,14 +54,14 @@ namespace pten { } template -void Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { auto out_dtype = x.dtype(); - pten::general::Reduce( + pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } @@ -93,14 +90,14 @@ void DivideKernel(const Context& dev_ctx, } template -void Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::general::Reduce( +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } @@ -120,8 +117,8 @@ using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_CTX_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) { -} +PT_REGISTER_CTX_KERNEL( + mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} PT_REGISTER_CTX_KERNEL(add, CPU, ALL_LAYOUT, @@ -166,7 +163,7 @@ PT_REGISTER_CTX_KERNEL(multiply, PT_REGISTER_CTX_KERNEL(sum, CPU, ALL_LAYOUT, - pten::Sum, + pten::SumKernel, bool, float, double, diff --git a/paddle/pten/kernels/hybird/general/reduce_impl.h b/paddle/pten/kernels/cpu/reduce.h similarity index 95% rename from paddle/pten/kernels/hybird/general/reduce_impl.h rename to paddle/pten/kernels/cpu/reduce.h index 631ad7f6125bc..fc5dbe9d58d63 100644 --- a/paddle/pten/kernels/hybird/general/reduce_impl.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -13,14 +13,15 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/transform.h" + +#include + #include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/cast_kernel.h" #include "paddle/pten/kernels/hybird/eigen/reduce.h" namespace pten { -namespace general { template void Reduce(const DeviceContext& dev_ctx, @@ -71,6 +72,4 @@ void Reduce(const DeviceContext& dev_ctx, } } -} // namespace general - } // namespace pten diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 760bebe687841..051f7cb3bdd05 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -16,9 +16,8 @@ limitations under the License. 
*/ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/gpu/reduce.h" #include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -76,12 +75,12 @@ struct DivideFunctor { */ template -void Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { auto out_dtype = x.dtype(); pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); @@ -97,13 +96,13 @@ DEFINE_CUDA_ELEMENTWISE_OP(Multiply) DEFINE_CUDA_ELEMENTWISE_OP(Divide) template -void Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } @@ -115,7 +114,7 @@ using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; PT_REGISTER_CTX_KERNEL( - mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {} + mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {} PT_REGISTER_CTX_KERNEL(add, GPU, ALL_LAYOUT, @@ -164,7 +163,7 @@ PT_REGISTER_CTX_KERNEL(multiply, PT_REGISTER_CTX_KERNEL(sum, GPU, ALL_LAYOUT, - pten::Sum, + pten::SumKernel, bool, float, double, diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/gpu/reduce.h similarity index 96% rename from paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h rename to paddle/pten/kernels/gpu/reduce.h index 4cfcad9149a3f..0704b76a2f069 100644 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -14,6 +14,9 @@ #pragma once +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + #include #include #include @@ -40,6 +43,7 @@ namespace cub = hipcub; #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/cast_kernel.h" #include "paddle/pten/kernels/copy_kernel.h" @@ -1230,4 +1234,48 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, } } // namespace kernels + +template class ReduceOp, + template class TransformOp> +void Reduce(const GPUContext& dev_ctx, + const DenseTensor& x, + bool reduce_all, + const std::vector& dims, + bool keep_dim, + DataType out_dtype, + DenseTensor* out) { + std::vector reduce_dims = + pten::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all); + + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (x.dims())[i]; + } + + gpuStream_t stream = dev_ctx.stream(); + + if (out_dtype != pten::DataType::UNDEFINED && out_dtype != x.dtype()) { + PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( + pten::DataType::INT32, + pten::DataType::INT64, + out_dtype, + "TensorReduceFunctorImpl", + ([&] { + using MPType = typename kps::details::MPTypeTrait::Type; + 
pten::kernels::TensorReduceFunctorImpl>( + x, out, TransformOp(reduce_num), reduce_dims, stream); + })); + } else { + using MPType = typename kps::details::MPTypeTrait::Type; + pten::kernels:: + TensorReduceFunctorImpl>( + x, out, TransformOp(reduce_num), reduce_dims, stream); + } +} } // namespace pten + +#endif diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h deleted file mode 100644 index 2281cd5ef78ea..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" -namespace pten { - -template class ReduceOp, - template class TransformOp> -void Reduce(const GPUContext& dev_ctx, - const DenseTensor& x, - bool reduce_all, - const std::vector& dims, - bool keep_dim, - DataType out_dtype, - DenseTensor* out) { - std::vector reduce_dims = - pten::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all); - - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (x.dims())[i]; - } - - gpuStream_t stream = dev_ctx.stream(); - - if (out_dtype != pten::DataType::UNDEFINED && out_dtype != x.dtype()) { - PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( - pten::DataType::INT32, - pten::DataType::INT64, - out_dtype, - "TensorReduceFunctorImpl", - ([&] { - using MPType = typename kps::details::MPTypeTrait::Type; - pten::kernels::TensorReduceFunctorImpl>( - x, out, TransformOp(reduce_num), reduce_dims, stream); - })); - } else { - using MPType = typename kps::details::MPTypeTrait::Type; - pten::kernels:: - TensorReduceFunctorImpl>( - x, out, TransformOp(reduce_num), reduce_dims, stream); - } -} - -} // namespace pten - -#endif diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index 2968aa3524a9f..b1e5188f3aaef 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -21,12 +21,12 @@ limitations under the License. 
*/ namespace pten { template -void Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); template void AddKernel(const Context& dev_ctx, @@ -57,13 +57,13 @@ void MultiplyKernel(const Context& dev_ctx, DenseTensor* out); template -void Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); template DenseTensor Add(const ContextT& dev_ctx, From e1cc22362f978d0a9ab3cad8d32184453c8f4d22 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Wed, 5 Jan 2022 15:28:28 +0800 Subject: [PATCH 007/151] add depthwise_conv2d op for mkldnn (#38484) --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 761ca3388530e..1bde58f7c4edb 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1017,6 +1017,36 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( ops::kConvMKLDNNFP32, ops::ConvMKLDNNGradOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + depthwise_conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNINT8, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNINT8, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + depthwise_conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, From 9108e777aea2eedd003e26606710505fc20ec67e Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 5 Jan 2022 16:05:58 +0800 Subject: [PATCH 008/151] [Eager] Support test imperative basic in eager test_empty_grad (#38376) * Rearranged Eager AutoCodeGen directory structure * Removed USE_OP in Eager AutoCodeGen * Enabled generation for Operators without Grad/Inputs/Outputs * Resolved operators without input * Fixed merge conflicts * Enabled Eager AutoCodeGen for 10+ more operators * Refactored Eager AutoCodeGen with more organized helper objects * Enabled Eager AutoCodeGen for operators with multiple OpBases * Adjusted Eager AutoCodeGen to Enable Passing Output Tensor as Input Argument * Handled Dispensable Inputs/Outputs in Eager AutoCodeGen * Adjusted function generation/call between Python-C API & Dygraph API * Synchronized auto-generated Python-C API with Dygraph Forward Functions * support more eager tensor 
api * fix merge compile error * fix compile error and fit develop code * support pure CPU * fix some logic error in eager_mode * support _varbase_creator in eager mode * Added safe_initialized interface to EagerTensor for use in processing dispensable inputs * for eager mode * refine * support multiple constructor for eager tensor * add place related code * polish code * specific randint with dtype of int64 * Support pure cpu test * eager logic * refine test in pure cpu * eager logic * eager logic * eager logic, test=develop * skip core.eager when in inference, test=develop * refine, test=develop * refine, test=develop * call RetainGrad after run forward kernel, test=develop * refine, test=develop * support dygraph util, meta, guard test * eager test case * support inference test * refine test and fix initializer failed * modify eagertensor patch method * add eagertensor.clear_grandint, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * call monkey_patch_varbase in _test_eager_guard, test=develop * split clear_gradient to clear_gradient and zero_grads, test=develop * refine, test=develop * refine, test=develop * refine, test=develop Co-authored-by: jim19930609 Co-authored-by: JiabinYang <360788950@qq.com> --- paddle/fluid/pybind/eager_method.cc | 90 ++++++++++- paddle/fluid/pybind/eager_utils.cc | 9 +- python/paddle/fluid/__init__.py | 4 - python/paddle/fluid/dygraph/math_op_patch.py | 29 +++- .../fluid/dygraph/varbase_patch_methods.py | 148 +++++++++++++----- python/paddle/fluid/eager/__init__.py | 20 --- .../fluid/eager/eager_tensor_patch_methods.py | 135 ---------------- python/paddle/fluid/framework.py | 6 + python/paddle/fluid/initializer.py | 2 +- .../unittests/test_egr_code_generate_api.py | 1 - .../tests/unittests/test_egr_python_api.py | 32 ++-- .../tests/unittests/test_imperative_basic.py | 8 +- python/setup.py.in | 1 - 13 files changed, 248 insertions(+), 237 deletions(-) delete mode 100644 python/paddle/fluid/eager/__init__.py delete mode 100644 python/paddle/fluid/eager/eager_tensor_patch_methods.py diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f47bd3350e30f..7f131f9ccd742 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -27,6 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" @@ -125,13 +126,17 @@ static PyObject* eager_tensor_method_copy_(EagerTensorObject* self, bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to " << self->eager_tensor.name(); + if (!self->eager_tensor.defined()) { + egr::EagerUtils::autograd_meta(&(self->eager_tensor)) + ->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient()); + egr::EagerUtils::autograd_meta(&(self->eager_tensor)) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); + } + self->eager_tensor.copy_(src_tensor, blocking); - egr::EagerUtils::autograd_meta(&(self->eager_tensor)) - ->SetStopGradient( - egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient()); - egr::EagerUtils::autograd_meta(&(self->eager_tensor)) - ->SetPersistable( - egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); + VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->eager_tensor.name(); Py_INCREF(Py_None); @@ -156,6 +161,74 @@ static PyObject* eager_tensor_retain_grads(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + VLOG(4) << "ClearGradient " << self->eager_tensor.name(); + + egr::EagerTensor grad; + if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { + // Add RetainGrad as PostHook to AccumulationNode + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + grad = accumulation_grad_node->Grad(); + } else { + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); + grad = meta->Grad(); + } + + if (grad.initialized()) { + VLOG(4) << "Gradient of " << self->eager_tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad.impl()); + dense_tensor->release(); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "ZeroGrads " << self->eager_tensor.name(); + + egr::EagerTensor grad; + if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { + // Add RetainGrad as PostHook to AccumulationNode + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + grad = accumulation_grad_node->Grad(); + } else { + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); + grad = meta->Grad(); + } + + if (grad.initialized()) { + grad.set_tensor(std::make_shared( + paddle::experimental::zeros_like(*(grad.Tensor().get())))); + } + Py_INCREF(Py_None); + 
return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -168,6 +241,11 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"retain_grads", (PyCFunction)(void (*)(void))eager_tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_clear_gradient", + (PyCFunction)(void (*)(void))eager_tensor__clear_gradient, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 879ea2b5d264e..9849d0d41611b 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -222,6 +222,8 @@ std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, reinterpret_cast(item->ob_type)->tp_name, i)); } } + } else if (obj == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -263,6 +265,8 @@ std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos) { reinterpret_cast(item->ob_type)->tp_name, i)); } } + } else if (obj == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -557,6 +561,8 @@ std::vector GetEagerTensorListFromArgs( reinterpret_cast(PyTuple_GetItem(list, i)) ->eager_tensor); } + } else if (list == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -634,6 +640,8 @@ std::vector GetEagerTensorPtrListFromArgs( &(reinterpret_cast(PyTuple_GetItem(list, i)) ->eager_tensor)); } + } else if (list == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -644,6 +652,5 @@ std::vector GetEagerTensorPtrListFromArgs( return result; } - } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index cd8f9f8545847..ec589b40e907f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -55,7 +55,6 @@ from .initializer import set_global_initializer from . import layers from . import dygraph -from . import eager from . import contrib from . import nets from . import optimizer @@ -91,7 +90,6 @@ from .io import save, load, load_program_state, set_program_state from .dygraph.checkpoint import save_dygraph, load_dygraph from .dygraph.varbase_patch_methods import monkey_patch_varbase -from .eager.eager_tensor_patch_methods import monkey_patch_eagertensor from . import generator from .core import _cuda_synchronize from .generator import Generator @@ -115,7 +113,6 @@ 'contrib', 'data', 'dygraph', - 'eager', 'enable_dygraph', 'disable_dygraph', 'enable_imperative', @@ -221,7 +218,6 @@ def remove_flag_if_exists(name): monkey_patch_variable() __bootstrap__() monkey_patch_varbase() -monkey_patch_eagertensor() # NOTE(zhiqiu): register npu_finalize on the exit of Python, # do some clean up manually. 
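# Illustrative sketch only (not part of the patch being applied): a minimal
# example of how the eager-mode pieces introduced in this change -- the
# `_test_eager_guard` context and the monkey-patched Tensor methods
# `backward`, `gradient` and `clear_gradient` -- might be exercised.  The
# values mirror the backward() docstring example kept in
# varbase_patch_methods.py; whether this runs on a given Paddle build is an
# assumption, as these APIs are experimental at this point in the series.
import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.to_tensor(5., stop_gradient=False)
    y = paddle.pow(x, 4.0)
    y.backward()
    assert np.allclose(x.gradient(), [500.])    # d(x**4)/dx evaluated at x = 5
    x.clear_gradient()                   # set_to_zero=True calls _zero_grads()
    x.clear_gradient(set_to_zero=False)  # releases the buffer via _clear_gradient()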
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 92fbc89a46e32..64c418fabb11f 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -60,6 +60,7 @@ ] _already_patch_varbase = False +_already_patch_eager_tensor = False def monkey_patch_math_varbase(): @@ -220,7 +221,11 @@ def __impl__(self, other_var): # 2. create varbase for scalar lhs_dtype = self.dtype - if not isinstance(other_var, core.VarBase): + if _in_eager_mode(): + other_var_should_be = core.eager.EagerTensor + else: + other_var_should_be = core.VarBase + if not isinstance(other_var, other_var_should_be): if isinstance(other_var, complex): import paddle other_var = paddle.to_tensor(other_var, dtype='complex64') @@ -333,22 +338,30 @@ def __impl__(self, other_var): ] global _already_patch_varbase + global _already_patch_eager_tensor + + if core._in_eager_mode(): + local_already_patch = _already_patch_eager_tensor + _already_patch_eager_tensor = True + local_tensor = core.eager.EagerTensor + else: + local_already_patch = _already_patch_varbase + _already_patch_varbase = True + local_tensor = core.VarBase - if not _already_patch_varbase: + if not local_already_patch: for method in varbase_methods: method_name = method[0] method_impl = method[1] - setattr(core.VarBase, method_name, method_impl) + setattr(local_tensor, method_name, method_impl) else: import paddle.tensor # Tensor method from module paddle.tensor for method_name in paddle.tensor.tensor_method_func: - if hasattr(core.VarBase, method_name): continue + if hasattr(local_tensor, method_name): continue method_impl = getattr(paddle.tensor, method_name, None) - if method_impl: setattr(core.VarBase, method_name, method_impl) + if method_impl: setattr(local_tensor, method_name, method_impl) for magic_method, origin_method in paddle.tensor.magic_method_func: impl = getattr(paddle.tensor, origin_method, None) - if impl: setattr(core.VarBase, magic_method, impl) - - _already_patch_varbase = True + if impl: setattr(local_tensor, magic_method, impl) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index a2cecb8030db5..c61f87ccf9089 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -22,7 +22,7 @@ from .. import framework from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_ +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, _in_eager_mode from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -58,6 +58,9 @@ def remove(self): return False +_already_patch_repr = False + + def monkey_patch_varbase(): @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): @@ -146,7 +149,11 @@ def set_value(self, value): out = linear(t) # call with different weight """ - assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \ + if _in_eager_mode(): + base_tensor = core.eager.EagerTensor + else: + base_tensor = core.VarBase + assert isinstance(value, (np.ndarray, base_tensor, dict, str)), \ "Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string." 
if isinstance(value, (dict, str)): @@ -160,7 +167,7 @@ def set_value(self, value): self.value().set_string_list(value) else: value_np = value - if isinstance(value, core.VarBase): + if isinstance(value, base_tensor): value_np = value.numpy() self_tensor_np = self.numpy() @@ -231,22 +238,40 @@ def backward(self, grad_tensor=None, retain_graph=False): """ if framework.in_dygraph_mode(): if grad_tensor is not None: - assert isinstance( - grad_tensor, paddle. - Tensor), "The type of grad_tensor must be paddle.Tensor" + if _in_eager_mode(): + assert isinstance( + grad_tensor, core.eager.EagerTensor + ), "The type of grad_tensor must be paddle.Tensor" + else: + assert isinstance( + grad_tensor, paddle. + Tensor), "The type of grad_tensor must be paddle.Tensor" assert grad_tensor.shape == self.shape, \ "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) + if _in_eager_mode(): + if grad_tensor is None: + grad_tensor = [] + else: + grad_tensor = [grad_tensor] if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) - core.dygraph_run_backward([scaled_loss], [grad_tensor], - retain_graph, - framework._dygraph_tracer()) + if _in_eager_mode(): + core.eager.run_backward([scaled_loss], grad_tensor, + retain_graph) + else: + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - core.dygraph_run_backward([self], [grad_tensor], retain_graph, - framework._dygraph_tracer()) + if _in_eager_mode(): + core.eager.run_backward([self], grad_tensor, retain_graph) + else: + core.dygraph_run_backward([self], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -280,15 +305,22 @@ def gradient(self): # [500.] 
""" - if self._grad_ivar() is None: - return None + if _in_eager_mode(): + if not self.grad._is_initialized(): + return None + # TODO(wanghuancoder) support SELECTED_ROWS + return self.grad.numpy() + else: + if self._grad_ivar() is None: + return None - new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True) - if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS: - return (np.array(new_ivar.value().get_selected_rows().get_tensor()), + new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True) + if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS: + return ( + np.array(new_ivar.value().get_selected_rows().get_tensor()), np.array(new_ivar.value().get_selected_rows().rows())) - else: - return np.array(new_ivar.value().get_tensor()) + else: + return np.array(new_ivar.value().get_tensor()) @framework.dygraph_only def register_hook(self, hook): @@ -555,8 +587,12 @@ def __str__(self): # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - from paddle.tensor.to_string import to_string - return to_string(self) + if _in_eager_mode(): + from paddle.tensor.to_string import eager_tensor_to_string + return eager_tensor_to_string(self) + else: + from paddle.tensor.to_string import to_string + return to_string(self) def __deepcopy__(self, memo): """ @@ -583,7 +619,10 @@ def __deepcopy__(self, memo): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - new_varbase = core.VarBase() + if _in_eager_mode(): + new_varbase = core.eager.EagerTensor() + else: + new_varbase = core.VarBase() new_varbase.name = self.name + unique_name.generate("_deepcopy") memo[id(self)] = new_varbase new_varbase.copy_(self, True) @@ -717,33 +756,62 @@ def is_combine_index(item): # Call c++ func __setitem_varbase__ to speedup. return self.__setitem_varbase__(item, value) + @framework.dygraph_only + def _grad_ivar(self): + if self.grad._is_initialized(): + return self.grad + else: + return None + + @framework.dygraph_only + def clear_gradient(self, set_to_zero=True): + if set_to_zero: + self._zero_grads() + else: + self._clear_gradient() + + if core._in_eager_mode() and not hasattr(core, "eager"): + return + for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), - ("inplace_version", inplace_version), ("grad", grad), - ("gradient", gradient), ("register_hook", register_hook), - ("__str__", __str__), ("__repr__", __str__), - ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), - ("__name__", "Tensor"), ("__array__", __array__), + ("inplace_version", inplace_version), ("gradient", gradient), + ("register_hook", register_hook), ("__str__", __str__), + ("__repr__", __str__), ("__deepcopy__", __deepcopy__), + ("__module__", "paddle"), ("__array__", __array__), ("__getitem__", __getitem__), ("item", item), ("__setitem__", __setitem__), ("_to", _to)): - setattr(core.VarBase, method_name, method) - - # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. - # So, we need to overwrite it to a more readable one. - # See details in https://github.com/pybind/pybind11/issues/2537. - origin = getattr(core.VarDesc.VarType, "__repr__") - - def dtype_str(dtype): - if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: - prefix = 'paddle.' 
- return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + if core._in_eager_mode(): + setattr(core.eager.EagerTensor, method_name, method) else: - # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR - return origin(dtype) + setattr(core.VarBase, method_name, method) + + if core._in_eager_mode(): + setattr(core.eager.EagerTensor, "_grad_ivar", _grad_ivar) + setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) + else: + setattr(core.VarBase, "__name__", "Tensor") + setattr(core.VarBase, "grad", grad) + + global _already_patch_repr + if not _already_patch_repr: + # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. + # So, we need to overwrite it to a more readable one. + # See details in https://github.com/pybind/pybind11/issues/2537. + origin = getattr(core.VarDesc.VarType, "__repr__") + + def dtype_str(dtype): + if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: + prefix = 'paddle.' + return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + else: + # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR + return origin(dtype) - setattr(core.VarDesc.VarType, "__repr__", dtype_str) + setattr(core.VarDesc.VarType, "__repr__", dtype_str) + _already_patch_repr = True # patch math methods for varbase monkey_patch_math_varbase() diff --git a/python/paddle/fluid/eager/__init__.py b/python/paddle/fluid/eager/__init__.py deleted file mode 100644 index 1dc82ef69979c..0000000000000 --- a/python/paddle/fluid/eager/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# incubate directory is mainly for internal use -# after we have tested incubate APIs in industrial application for a period -# we will move stable functions into fluid - -from . import eager_tensor_patch_methods - -__all__ = [] diff --git a/python/paddle/fluid/eager/eager_tensor_patch_methods.py b/python/paddle/fluid/eager/eager_tensor_patch_methods.py deleted file mode 100644 index 2586685ec1ada..0000000000000 --- a/python/paddle/fluid/eager/eager_tensor_patch_methods.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .. import core as core -from .. 
import framework as framework -from ..dygraph.parallel import scale_loss -import numpy as np - - -def monkey_patch_eagertensor(): - def __str__(self): - from paddle.tensor.to_string import eager_tensor_to_string - return eager_tensor_to_string(self) - - @framework.dygraph_only - def backward(self, grad_tensor=None, retain_graph=False): - """ - Run backward of current Graph which starts from current Tensor. - - The new gradient will accumulat on previous gradient. - - You can clear gradient by ``Tensor.clear_grad()`` . - - Args: - grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, - the initial gradient values of the current Tensor would be Tensor filled with 1.0; - if `grad_tensor` is not None, it must have the same length as the current Tensor. - Teh default value is None. - - retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would - like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter - :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. - Defaults to False. - Returns: - NoneType: None - - Examples: - .. code-block:: python - - import paddle - x = paddle.to_tensor(5., stop_gradient=False) - for i in range(5): - y = paddle.pow(x, 4.0) - y.backward() - print("{}: {}".format(i, x.grad)) - # 0: [500.] - # 1: [1000.] - # 2: [1500.] - # 3: [2000.] - # 4: [2500.] - - x.clear_grad() - print("{}".format(x.grad)) - # 0. - - grad_tensor=paddle.to_tensor(2.) - for i in range(5): - y = paddle.pow(x, 4.0) - y.backward(grad_tensor) - print("{}: {}".format(i, x.grad)) - # 0: [1000.] - # 1: [2000.] - # 2: [3000.] - # 3: [4000.] - # 4: [5000.] - - """ - if framework.in_dygraph_mode(): - if grad_tensor is not None: - assert isinstance( - grad_tensor, core.eager.EagerTensor - ), "The type of grad_tensor must be paddle.Tensor" - assert grad_tensor.shape == self.shape, \ - "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( - grad_tensor.name, grad_tensor.shape, self.name, self.shape) - grad_tensor = [grad_tensor] - else: - grad_tensor = [] - - if core.is_compiled_with_xpu() or core.is_compiled_with_npu(): - # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. - scaled_loss = scale_loss(self) - core.eager.run_backward([scaled_loss], grad_tensor, - retain_graph) - else: - core.eager.run_backward([self], grad_tensor, retain_graph) - else: - raise ValueError( - "Variable.backward() is only available in DyGraph mode") - - @framework.dygraph_only - def gradient(self): - """ - .. warning:: - This API will be deprecated in the future, it is recommended to use - :code:`x.grad` which returns the tensor value of the gradient. - - Get the Gradient of Current Tensor. - - Returns: - ndarray: Numpy value of the gradient of current Tensor - - Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor(5., stop_gradient=False) - y = paddle.pow(x, 4.0) - y.backward() - print("grad of x: {}".format(x.gradient())) - # [500.] 
- - """ - if self.grad._is_initialized(): - return self.grad.numpy() - else: - return None - # TODO(wanghuancoder) support SELECTED_ROWS - - if hasattr(core, "eager"): - setattr(core.eager.EagerTensor, "__str__", __str__) - setattr(core.eager.EagerTensor, "backward", backward) - setattr(core.eager.EagerTensor, "gradient", gradient) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 73407ef834e22..3d8cd1142cf3a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -77,6 +77,7 @@ _current_device = None global_prog_seed = 0 _current_pipeline_stage = None +_already_patch_eager_tensor = False _global_flags_ = core.globals() core._disable_eager_mode() @@ -85,6 +86,11 @@ def _test_eager_guard(tracer=None): core._enable_eager_mode() _C_ops.switch_to_eager_ops() + global _already_patch_eager_tensor + if not _already_patch_eager_tensor: + from .dygraph.varbase_patch_methods import monkey_patch_varbase + monkey_patch_varbase() + _already_patch_eager_tensor = True if tracer is None: core._set_eager_tracer(_dygraph_tracer_) else: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5d0b56ed537d8..fd1562d609a1d 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -604,7 +604,7 @@ def __call__(self, var, block=None): if framework.in_dygraph_mode(): if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) - out_var = _C_ops.uniform_random('shape', var.shape, 'min', + out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', -limit, 'max', limit, 'seed', self._seed, 'dtype', out_dtype) else: diff --git a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py index 3bf2be3d64bee..45cb7e785bc5e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle.fluid.core as core -import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods import paddle import numpy as np from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 6d81a27882ff0..e4576fe2ea8bd 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -13,7 +13,6 @@ # limitations under the License. 
import paddle.fluid.core as core -import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods import paddle import numpy as np from paddle.fluid.framework import _test_eager_guard, EagerParamBase, _in_eager_mode @@ -621,7 +620,7 @@ def test_copy_and_copy_to(self): self.assertTrue(np.array_equal(tensor.numpy(), arr)) print("Test copy_") tensor.copy_(tensor1, True) - self.assertEqual(tensor.persistable, True) + self.assertEqual(tensor.persistable, False) self.assertEqual(tensor.shape, [4, 16]) self.assertEqual(tensor.dtype, core.VarDesc.VarType.FP32) self.assertTrue(np.array_equal(tensor.numpy(), arr1)) @@ -764,20 +763,21 @@ def test_to_variable(self): self.assertTrue(np.array_equal(res3, res4)) def test_backward_with_single_tensor(self): - arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor12 = core.eager.EagerTensor(arr4, core.CPUPlace()) - egr_tensor12.retain_grads() - arr = np.ones([4, 16, 16, 32]).astype('float32') - self.assertEqual(egr_tensor12.persistable, False) - self.assertTrue("generated_tensor" in egr_tensor12.name) - self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) - self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) - self.assertEqual(egr_tensor12.stop_gradient, True) - self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) - self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4)) - self.assertTrue(np.array_equal(egr_tensor12.gradient(), None)) - egr_tensor12.backward() - self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr)) + with _test_eager_guard(): + arr4 = np.random.rand(4, 16, 16, 32).astype('float32') + egr_tensor12 = core.eager.EagerTensor(arr4, core.CPUPlace()) + egr_tensor12.retain_grads() + arr = np.ones([4, 16, 16, 32]).astype('float32') + self.assertEqual(egr_tensor12.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor12.name) + self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) + self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor12.stop_gradient, True) + self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) + self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4)) + self.assertTrue(np.array_equal(egr_tensor12.gradient(), None)) + egr_tensor12.backward() + self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr)) class EagerGuardTestCase(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 5b9e9ab8373ab..262d07336de08 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -267,7 +267,7 @@ def test_no_grad_guard(self): tmp = l1.weight * 2 self.assertTrue(tmp.stop_gradient) x = paddle.to_tensor(data) - y = l0(x) + tmp + y = paddle.add(l0(x), tmp) o = l1(y) o.backward() @@ -285,7 +285,7 @@ def test_paddle_imperative_no_grad_guard(self): tmp = l1.weight * 2 self.assertTrue(tmp.stop_gradient) x = paddle.to_tensor(data) - y = l0(x) + tmp + y = paddle.add(l0(x), tmp) o = l1(y) o.backward() @@ -306,7 +306,7 @@ def test_paddle_imperative_set_grad_enabled(self): self.assertTrue(tmp.stop_gradient) self.assertTrue(tmp2.stop_gradient is False) x = paddle.to_tensor(data) - y = l0(x) + tmp2 + y = paddle.add(l0(x), tmp2) o = l1(y) o.backward() @@ -329,7 +329,7 @@ def test_sum_op(self): tmp = paddle.to_tensor(x) tmp.stop_gradient = False inputs.append(tmp) - ret = fluid.layers.sums(inputs) + ret = paddle.add_n(inputs) 
loss = fluid.layers.reduce_sum(ret) loss.backward() with fluid.dygraph.guard(): diff --git a/python/setup.py.in b/python/setup.py.in index f14111c7dabb9..6b38facb5fd5e 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -308,7 +308,6 @@ packages=['paddle', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.amp', - 'paddle.fluid.eager', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.distributed', From 40078103c480ba9a6cf59deb7fdba951c69b62f2 Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 5 Jan 2022 17:10:52 +0800 Subject: [PATCH 009/151] update masked_select_op for kunlun (#38678) --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/masked_select_op_xpu.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 588ba0bfe86cd..c7a6f04b5f40a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index aafc2510a8c44..c575f133b1572 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -57,7 +57,7 @@ class MaskedSelectXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_XPU_SUCCESS( xpu::masked_select(dev_ctx.x_context(), input_data, mask_data, out_data, - input_shape, mask_shape)); + input_shape, mask_shape, out_size_cpu)); } }; From bbe83ed1c0cce65a5ab551ada0154852d8f728cf Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 5 Jan 2022 17:19:36 +0800 Subject: [PATCH 010/151] [XPU] update XPU run check scripts, test=develop (#38698) --- python/CMakeLists.txt | 2 + python/paddle/utils/install_check.py | 58 +++++++++++++++++++++------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0fecd7c8c36ee..fe5f2c25ca551 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -12,6 +12,8 @@ elseif(WITH_ROCM) SET(PACKAGE_NAME "paddlepaddle-rocm") elseif(WITH_ASCEND_CL) SET(PACKAGE_NAME "paddlepaddle-npu") +elseif(WITH_XPU) + SET(PACKAGE_NAME "paddlepaddle-xpu") else() SET(PACKAGE_NAME "paddlepaddle") endif() diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index efdc6847f0056..9feda3d2dae6a 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -89,16 +89,35 @@ def _is_npu_available(): return False -def _run_dygraph_single(use_cuda, use_npu): +def _is_xpu_available(): """ - Testing the simple network in dygraph mode using one CPU/GPU. + Check whether XPU is avaiable. + """ + try: + assert len(paddle.static.xpu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using XPU version PaddlePaddle, but there is no XPU " + "detected on your machine. Maybe XPU devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_xpu, use_npu): + """ + Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. 
+ use_npu (bool): Whether running with NPU. """ paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_xpu: + paddle.set_device('xpu') elif use_npu: paddle.set_device('npu') else: @@ -119,12 +138,14 @@ def _run_dygraph_single(use_cuda, use_npu): opt.step() -def _run_static_single(use_cuda, use_npu): +def _run_static_single(use_cuda, use_xpu, use_npu): """ - Testing the simple network with executor running directly, using one CPU/GPU. + Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. + use_npu (bool): Whether running with NPU. """ paddle.enable_static() with paddle.static.scope_guard(paddle.static.Scope()): @@ -138,6 +159,8 @@ def _run_static_single(use_cuda, use_npu): if use_cuda: place = paddle.CUDAPlace(0) + elif use_xpu: + place = paddle.XPUPlace(0) elif use_npu: place = paddle.NPUPlace(0) else: @@ -151,12 +174,14 @@ def _run_static_single(use_cuda, use_npu): paddle.disable_static() -def _run_static_parallel(use_cuda, use_npu, device_list): +def _run_static_parallel(use_cuda, use_xpu, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. + use_npu (bool): Whether running with NPU. device_list (int): The specified devices. """ paddle.enable_static() @@ -175,6 +200,9 @@ def _run_static_parallel(use_cuda, use_npu, device_list): if use_cuda: place = paddle.CUDAPlace(0) + elif use_xpu: + place = paddle.XPUPlace(0) + compiled_prog = train_prog elif use_npu: place = paddle.NPUPlace(0) compiled_prog = train_prog @@ -210,19 +238,23 @@ def run_check(): print("Running verify PaddlePaddle program ... 
") + use_cuda = False + use_xpu = False + use_npu = False + if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() - use_npu = False + elif paddle.is_compiled_with_xpu(): + use_xpu = _is_xpu_available() elif paddle.is_compiled_with_npu(): use_npu = _is_npu_available() - use_cuda = False - else: - use_npu = False - use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_xpu: + device_str = "XPU" + device_list = paddle.static.xpu_places() elif use_npu: device_str = "NPU" device_list = paddle.static.npu_places() @@ -231,12 +263,12 @@ def run_check(): device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda, use_npu) - _run_dygraph_single(use_cuda, use_npu) + _run_static_single(use_cuda, use_xpu, use_npu) + _run_dygraph_single(use_cuda, use_xpu, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, use_npu, device_list) + _run_static_parallel(use_cuda, use_xpu, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print( From 60c51de5cebb5c098618236815ee32d62432048b Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Wed, 5 Jan 2022 10:26:02 +0100 Subject: [PATCH 011/151] Add input data type checking in BF16 placement pass (#38702) --- .../framework/ir/graph_pattern_detector.cc | 2 + .../framework/ir/graph_pattern_detector.h | 1 + .../ir/mkldnn/cpu_bfloat16_placement_pass.cc | 4 ++ .../cpu_bfloat16_placement_pass_tester.cc | 41 +++++++++++++++---- 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6949e4d078c0c..8c4965fc40235 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2441,11 +2441,13 @@ PDNode *patterns::Bfloat16Placement::operator()( if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } + auto *op_in = pattern->NewNode(op_in_repr())->AsInput(); auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); op->assert_more([&](Node *node) { return node->Op()->GetAttrIfExists("use_mkldnn") || node->Op()->Type() == "reshape2"; }); + op->LinksFrom({op_in}); return op; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 940f6b8561e48..5b996a3ab918b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1446,6 +1446,7 @@ struct Bfloat16Placement : public PatternBase { PDNode* operator()( const std::unordered_set& bfloat16_enabled_op_types); + PATTERN_DECL_NODE(op_in); PATTERN_DECL_NODE(op); }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index 0f9edeba525b0..d89891ec3c857 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -41,8 +41,12 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern); GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern); + // Only float input can be converted to bfloat16 + if (op_in->Var()->GetDataType() != 
proto::VarType::FP32) return; + if ((op->Op()->HasAttr("mkldnn_data_type") || op->Op()->HasProtoAttr("mkldnn_data_type")) && !platform::HasOpINT8DataType(op->Op())) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc index 28a45f36fb71d..e3ef7b7af05d2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -68,7 +68,7 @@ ProgramDesc BuildProgramDesc() { for (auto& v : std::vector({"a", "b", "c", "f", "g", "h", "k", "l", "m", "n", "o", "p", "r", "s"})) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}); @@ -86,9 +86,8 @@ ProgramDesc BuildProgramDesc() { } void MainTest(std::initializer_list bfloat16_enabled_op_types, - unsigned expected_bfloat16_data_type_count) { - auto prog = BuildProgramDesc(); - + unsigned expected_bfloat16_data_type_count, + const ProgramDesc& prog) { std::unique_ptr graph(new ir::Graph(prog)); auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); @@ -110,8 +109,8 @@ void MainTest(std::initializer_list bfloat16_enabled_op_types, EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count); } -void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { - auto prog = BuildProgramDesc(); +void DefaultAttrTest(unsigned expected_bfloat16_data_type_count, + const ProgramDesc& prog) { std::unique_ptr graph(new ir::Graph(prog)); auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); graph.reset(pass->Apply(graph.release())); @@ -128,15 +127,39 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { } TEST(Bfloat16PlacementPass, enable_all) { - MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8); + MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8, + BuildProgramDesc()); } TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { // 2 conv2d + 2 pool2 - 1 orphaned conv2d - MainTest({"conv2d", "pool2d"}, 3); + MainTest({"conv2d", "pool2d"}, 3, BuildProgramDesc()); +} + +TEST(Bfloat16PlacementPass, default_attr_value) { + DefaultAttrTest(10, BuildProgramDesc()); +} + +ProgramDesc BuildProgramDescWithDataType() { + ProgramDesc prog; + + for (auto& v : std::vector({"a", "b", "c", "d", "e"})) { + if (v == "a") { + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::INT32); + } else { + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); + } + } + + SetOp(&prog, "conv2d", "conv1", {"a"}, {"b"}); + SetOp(&prog, "pool2d", "pool1", {"b"}, {"c"}); + SetOp(&prog, "concat", "concat1", {"c", "d"}, {"e"}); + return prog; } -TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(10); } +TEST(Bfloat16PlacementPass, check_data_types) { + DefaultAttrTest(2, BuildProgramDescWithDataType()); +} } // namespace ir } // namespace framework From 0af1a87b70af7f478b9195f52515f821b64242d1 Mon Sep 17 00:00:00 2001 From: Jiaqi Liu Date: Wed, 5 Jan 2022 19:21:07 +0800 Subject: [PATCH 012/151] Make post training quant API support dataloader (#38686) * make post training quant API support dataloader --- .../post_training_quantization.py | 17 ++++++-- .../test_post_training_quantization_while.py | 43 ++++++++++++++++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py 
b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index e9173a86b89fa..9da798375af25 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -17,6 +17,7 @@ import logging import numpy as np import shutil +from inspect import isgeneratorfunction from .... import io from .... import core from .... import framework @@ -136,6 +137,7 @@ def __init__(self, params_filename=None, batch_generator=None, sample_generator=None, + data_loader=None, batch_size=10, batch_nums=None, algo="KL", @@ -175,6 +177,9 @@ def __init__(self, calibrate data for DataLoader, and it only returns a sample every time. Note that, sample_generator and batch_generator, only one should be set. Beisdes, sample_generator dose not support lod tensor. + data_loader(Python Generator, Paddle.io.DataLoader, optional): The + Generator or Dataloader provides calibrate data, and it could + return a batch every time. batch_size(int, optional): The batch size of DataLoader. Default is 10. batch_nums(int, optional): If batch_nums is not None, the number of calibrate data is batch_size*batch_nums. If batch_nums is None, use @@ -279,8 +284,11 @@ def __init__(self, assert executor is not None, "The executor cannot be None." assert model_dir is not None, "The model_dir cannot be None." assert any([gen is not None] for gen in [sample_generator, - batch_generator]), "The sample_generator and batch_generator " \ - "cannot be None in the same time." + batch_generator, data_loader]), "The sample_generator, batch_generator " \ + "and data_loader cannot be None in the same time." + if data_loader is not None: + assert isinstance(data_loader, (io.DataLoader, type(isgeneratorfunction))), \ + "data_loader only accepts `paddle.io.DataLoader` or Generator instance." assert batch_size > 0, "The batch_size should be greater than 0." assert algo in self._support_algo_type, \ "The algo should be KL, hist, mse, avg, abs_max or min_max." 
@@ -323,7 +331,7 @@ def __init__(self, self._program = None self._feed_list = None self._fetch_list = None - self._data_loader = None + self._data_loader = data_loader self._out_scale_op_list = _out_scale_op_list self._quantized_weight_var_name = set() @@ -473,6 +481,9 @@ def _load_model_data(self): feed_vars = [framework._get_var(str(var_name), self._program) \ for var_name in self._feed_list] + + if self._data_loader is not None: + return self._data_loader = io.DataLoader.from_generator( feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True) if self._sample_generator is not None: diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py index 3c3dfd08fccfa..642bcf2a47679 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -115,19 +115,30 @@ def generate_quantized_model(self, is_use_cache_file=False, is_optimize_model=False, batch_size=10, - batch_nums=10): + batch_nums=10, + is_data_loader=False): place = fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.global_scope() val_reader = paddle.dataset.mnist.train() + def val_data_generator(): + batches = [] + for data in val_reader(): + batches.append(data[0].reshape(1, 28, 28)) + if len(batches) == batch_size: + batches = np.asarray(batches) + yield {"x": batches} + batches = [] + ptq = PostTrainingQuantization( executor=exe, model_dir=model_path, model_filename='model.pdmodel', params_filename='model.pdiparams', - sample_generator=val_reader, + sample_generator=val_reader if not is_data_loader else None, + data_loader=val_data_generator if is_data_loader else None, batch_size=batch_size, batch_nums=batch_nums, algo=algo, @@ -153,7 +164,8 @@ def run_test(self, diff_threshold, batch_size=10, infer_iterations=10, - quant_iterations=5): + quant_iterations=5, + is_data_loader=False): origin_model_path = self.download_model(data_url, data_md5, model_name) #origin_model_path = os.path.join(origin_model_path, model_name) @@ -166,8 +178,15 @@ def run_test(self, print("Start INT8 post training quantization for {0} on {1} images ...". 
format(model_name, quant_iterations * batch_size)) self.generate_quantized_model( - origin_model_path, algo, quantizable_op_type, is_full_quantize, - is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + origin_model_path, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + batch_size, + quant_iterations, + is_data_loader=is_data_loader) print("Start INT8 inference for {0} on {1} images ...".format( model_name, infer_iterations * batch_size)) @@ -307,6 +326,20 @@ def test_post_training_abs_max(self): is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) + self.run_test( + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size, + infer_iterations, + quant_iterations, + is_data_loader=True) if __name__ == '__main__': From 905c80222bd59a2491b704018875fcc18276aa1e Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 5 Jan 2022 20:52:59 +0800 Subject: [PATCH 013/151] Fix bug for UT GetAllocatorInterfaceTest (#38720) * Fix bug of GetAllocatorInterfaceTest * Replace some shared_ptr with unique_ptr * Change Alloc call --- .../memory/stream_safe_cuda_alloc_test.cu | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 083b8a14d29f2..bb44b29ac5b01 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -85,7 +85,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { workspaces_.emplace_back(allocation); } - result_ = AllocShared(place_, stream_num_ * workspace_size_); + result_ = Alloc(place_, stream_num_ * workspace_size_); } void SingleStreamRun(size_t idx) { @@ -185,7 +185,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { platform::CUDAPlace place_; std::vector streams_; std::vector> workspaces_; - std::shared_ptr result_; + allocation::AllocationPtr result_; }; TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { @@ -225,22 +225,23 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 256; + + allocation::AllocationPtr allocation_implicit_stream = + Alloc(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + void *address = allocation_implicit_stream->ptr(); + allocation_implicit_stream.reset(); + auto &instance = allocation::AllocatorFacade::Instance(); const std::shared_ptr &allocator = instance.GetAllocator(place); - size_t alloc_size = 256; - std::shared_ptr allocation_from_allocator = + allocation::AllocationPtr allocation_from_allocator = allocator->Allocate(alloc_size); EXPECT_GE(allocation_from_allocator->size(), alloc_size); - void *address = allocation_from_allocator->ptr(); + EXPECT_EQ(allocation_from_allocator->ptr(), address); allocation_from_allocator.reset(); - std::shared_ptr allocation_implicit_stream = - AllocShared(place, alloc_size); - EXPECT_GE(allocation_implicit_stream->size(), alloc_size); - EXPECT_EQ(allocation_implicit_stream->ptr(), address); - allocation_implicit_stream.reset(); - Release(place); CheckMemLeak(place); } @@ -347,16 +348,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second alloc will fail and retry size_t 
alloc_size = available_size / 4 * 3; - std::shared_ptr allocation1 = AllocShared( - place, alloc_size, - platform::Stream(reinterpret_cast(stream1))); - std::shared_ptr allocation2; + allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared( - place, alloc_size, - platform::Stream(reinterpret_cast(stream2))); + allocation2 = Alloc(place, alloc_size, stream2); }); allocation1.reset(); // free but not release th.join(); From 36a102f85de37c4d6ade394eb3f552a1dc1a30b7 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 5 Jan 2022 21:49:41 +0800 Subject: [PATCH 014/151] optimize elementwise_mul_grad using new interfaces (#37728) * init commit: new elem_mul_grad * add template speciallization for complex in multiply * reply review comments * correct dx and dy computation when T is complex * reply review comments * update to new ReduceRunctor * mul-output broadcast * call functions * call functions with comments * remove comments --- .../elementwise/elementwise_functor.h | 42 ++++++++ .../elementwise/elementwise_mul_op.cu | 95 +++++++------------ .../elementwise/elementwise_mul_op.h | 29 ++---- 3 files changed, 86 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index b7bebcaa386ba..a62c531ff0733 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -194,5 +194,47 @@ struct FMinFunctor { } }; +template +struct MulGradFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct MulGradFunctor> { + inline HOSTDEVICE Complex operator()(const Complex& a, + const Complex& b) const { + Complex b_conj(b.real, -b.imag); + return a * b_conj; + } +}; + +template +struct MulGradXYFunctor { + inline HOSTDEVICE paddle::framework::Array operator()(const InT& a, + const InT& b, + const InT& c) { + paddle::framework::Array outs; + // dx = dout * y + outs[0] = a * b; + // dy = dout * x + outs[1] = a * c; + return outs; + } +}; + +template +struct MulGradXYFunctor, Complex> { + inline HOSTDEVICE paddle::framework::Array, 2> operator()( + const Complex& a, const Complex& b, const Complex& c) { + paddle::framework::Array, 2> outs; + // dx = dout * y + Complex b_conj(b.real, -b.imag); + outs[0] = a * b_conj; + // dy = dout * x + Complex c_conj(c.real, -c.imag); + outs[1] = a * c_conj; + return outs; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 12e0062a698be..cdf376fd6a8cc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -68,69 +69,41 @@ class ElementwiseMulKernel } }; -template -static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, - const T* out, - const T* dout, - int64_t size, T* dx, - T* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - T o = dout[col]; - dx[col] = y[col] * o; - dy[col] = x[col] * o; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void SimpleElemwiseMulGradCUDAKernel>( - const plat::complex* x, const plat::complex* y, - const plat::complex* out, const plat::complex* dout, - int64_t size, plat::complex* dx, plat::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - plat::complex o = dout[col]; - dx[col] = plat::complex(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex(x[col].real, -x[col].imag) * o; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void SimpleElemwiseMulGradCUDAKernel>( - const plat::complex* x, const plat::complex* y, - const plat::complex* out, const plat::complex* dout, - int64_t size, plat::complex* dx, plat::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - plat::complex o = dout[col]; - dx[col] = plat::complex(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex(x[col].real, -x[col].imag) * o; - col += blockDim.x * gridDim.x; - } -} - template typename std::enable_if< - std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); - auto size = x->numel(); - dim3 grid_size = - dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); - SimpleElemwiseMulGradCUDAKernel< - T><<().stream()>>>( - x->data(), y->data(), out->data(), dout->data(), size, - dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); + std::is_same::value>::type +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + const auto& dev_ctx = + ctx.template device_context(); + const auto place = ctx.GetPlace(); + + if (dx != nullptr && dy != nullptr) { + dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, y, x}; + GetGradXAndYOut( + dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, y}; + GetGradXOrYOut(dev_ctx, place, axis, ins, dout, + dx, MulGradFunctor()); + } else if (dx == nullptr && dy != nullptr) { + std::vector ins = {dout, x}; + GetGradXOrYOut(dev_ctx, place, axis, ins, dout, + dy, MulGradFunctor()); + } } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h 
b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 3b0f072572210..5cff3173e8115 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -174,26 +174,23 @@ struct MulGradDY> { template typename std::enable_if< std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); ElemwiseGradCompute, MulGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cuda definition template typename std::enable_if< std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy); +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); #endif template @@ -209,14 +206,8 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel { auto* out = dout; // out is not necessary auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_mul_grad(ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - } + + ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); } }; From 718183f1c37ed7c1b4bcba924e08fc0ca0932ad6 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 6 Jan 2022 02:46:19 +0100 Subject: [PATCH 015/151] Added exp FP32 FWD/BWD oneDNN kernel and optimized other oneDNN grad kernels (#38624) * added exp activation and use_dst_for_bwd kernels * CI RERUN * minor change --- .../operators/mkldnn/activation_mkldnn_op.cc | 95 ++++++++++++++----- .../mkldnn/test_activation_mkldnn_op.py | 10 ++ 2 files changed, 80 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 44edc22479570..8630515a9fdaf 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -83,9 +83,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); + auto *out = ctx.Output("Out"); - bool is_inplaced = x->IsSharedBufferWith(*y); + bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x); @@ -94,9 +94,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::shared_ptr dst_memory_p = nullptr; if (is_inplaced) { dst_memory_p = src_memory_p; - y->mutable_data(ctx.GetPlace()); + out->mutable_data(ctx.GetPlace()); } else { - dst_memory_p = 
handler.AcquireDstMemory(y); + dst_memory_p = handler.AcquireDstMemory(out); } auto activation_p = handler.AcquireForwardPrimitive(); @@ -105,8 +105,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); - y->set_format(GetMKLDNNFormat(*dst_memory_p)); + out->set_layout(DataLayout::kMKLDNN); + out->set_format(GetMKLDNNFormat(*dst_memory_p)); } template @@ -116,15 +116,15 @@ void eltwise_grad(const framework::ExecutionContext &ctx, const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); - const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - auto *diff_x = ctx.Output(framework::GradVarName("X")); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x, diff_y); + ctx.GetPlace(), x, dout); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); auto activation_backward_p = handler.AcquireBackwardPrimitive(); auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); @@ -134,8 +134,37 @@ void eltwise_grad(const framework::ExecutionContext &ctx, {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(GetMKLDNNFormat(*diff_src_memory_p)); +} + +template +void eltwise_grad_use_out(const framework::ExecutionContext &ctx, + dnnl::algorithm algorithm) { + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + const auto *out = ctx.Input("Out"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), out, dout); + + auto dst_memory_p = handler.AcquireBackwardSrcMemory(out); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_DST, *dst_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } template @@ -152,6 +181,13 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor { } }; +template +struct MKLDNNActivationGradUseOutFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + eltwise_grad_use_out(ctx, algorithm); + } +}; + template struct GeluMKLDNNFunctor : public BaseActivationFunctor { void operator()(const framework::ExecutionContext &ctx) const { @@ -217,6 +253,9 @@ using AbsMKLDNNFunctor = MKLDNNActivationFunc; template using EluMKLDNNFunctor = MKLDNNActivationFunc; +template +using ExpMKLDNNFunctor = MKLDNNActivationFunc; + template using ReluMKLDNNGradFunctor = 
MKLDNNActivationGradFunc; @@ -234,24 +273,29 @@ using HardSwishMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using SigmoidMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; template -using TanhMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; template -using SqrtMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; template using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using EluMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; + +template +using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; + } // namespace operators } // namespace paddle @@ -281,9 +325,10 @@ namespace ops = paddle::operators; __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ + __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ - __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradFunctor); + __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \ + __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, @@ -291,9 +336,9 @@ REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, - SigmoidMKLDNNGradFunctor); + SigmoidMKLDNNGradUseOutFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sqrt, SqrtMKLDNNFunctor, - SqrtMKLDNNGradFunctor); + SqrtMKLDNNGradUseOutFunctor); namespace ops = paddle::operators; REGISTER_OP_KERNEL( diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 6ee266a93d56a..8af2101346fec 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -349,6 +349,16 @@ def set_alpha(self): self.alpha = 2.5 +class TestMKLDNNExpOp(TestActivation): + def setUp(self): + self.op_type = "exp" + x = np.random.random((5, 5, 4)).astype("float32") + + self.inputs = {'X': x} + self.attrs = {'use_mkldnn': True} + self.outputs = {'Out': np.exp(x)} + + # Check if primitives already exist in backward class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase): def setUp(self): From c1adced7d23218a39fba174a655289ab0bdd7c7a Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 6 Jan 2022 09:54:34 +0800 Subject: [PATCH 016/151] [Pten]Move GPU_implementation of elementwise kernel in new directory (#38696) * move gpu_impl of elementwise kernel * change copyright to 2022 --- paddle/fluid/operators/dropout_impl.cu.h | 4 +- .../elementwise/elementwise_op_function.h | 2 +- 
.../elementwise/elementwise_op_impl.cu.h | 2 +- paddle/pten/kernels/cpu/CMakeLists.txt | 1 - .../cpu/{elementwise_impl.h => elementwise.h} | 0 paddle/pten/kernels/cpu/math.cc | 15 - paddle/pten/kernels/cpu/math_kernel.cc | 2 +- .../pten/kernels/funcs/cuda_kernel_config.h | 55 +++ .../elementwise.h} | 332 +++++++++++++++++- paddle/pten/kernels/gpu/math_kernel.cu | 5 +- .../hybird/cuda/elementwise/elementwise.h | 52 --- .../cuda/elementwise/elementwise_common.cu.h | 120 ------- .../elementwise/elementwise_no_broadcast.cu.h | 253 ------------- .../kernels/hybird/general/manipulation.h | 34 -- 14 files changed, 391 insertions(+), 486 deletions(-) rename paddle/pten/kernels/cpu/{elementwise_impl.h => elementwise.h} (100%) delete mode 100644 paddle/pten/kernels/cpu/math.cc create mode 100644 paddle/pten/kernels/funcs/cuda_kernel_config.h rename paddle/pten/kernels/{hybird/cuda/elementwise/elementwise_broadcast.cu.h => gpu/elementwise.h} (61%) delete mode 100644 paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h delete mode 100644 paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h delete mode 100644 paddle/pten/kernels/hybird/general/manipulation.h diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 7491d6189ebde..a708cbbfaacfc 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace paddle { namespace operators { @@ -193,7 +193,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; - int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size); + int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size); int grid_size = ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a145848bad96c..3929699955a17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,7 +31,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/cpu/elementwise.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 27897f10a3c63..1d8acd5eca5d9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" +#include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { namespace operators { diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt index 9bf3df598e4c0..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/cpu/CMakeLists.txt @@ -1 +0,0 @@ -cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel) diff --git a/paddle/pten/kernels/cpu/elementwise_impl.h b/paddle/pten/kernels/cpu/elementwise.h similarity index 100% rename from paddle/pten/kernels/cpu/elementwise_impl.h rename to paddle/pten/kernels/cpu/elementwise.h diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc deleted file mode 100644 index b4642d475d566..0000000000000 --- a/paddle/pten/kernels/cpu/math.cc +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -namespace pten {} // namespace pten diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 4f895d9514a97..2a696584bc781 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/cpu/elementwise.h" #include "paddle/pten/kernels/cpu/reduce.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h new file mode 100644 index 0000000000000..27fbc1de55a35 --- /dev/null +++ b/paddle/pten/kernels/funcs/cuda_kernel_config.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + +namespace pten { +namespace funcs { +/* +* According to NVIDIA, if number of threads per block is 64/128/256/512, +* cuda performs better. 
And number of blocks should be greater (at least +* 2x~4x) than number of SMs. Hence, SM count is took into account within +* this function to determine the right number of threads per block. +*/ +inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, + int64_t numel, + int vec_size) { + int threads = ELEMENTWISE_BLOCK_SIZE; + int sm_count = ctx.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + return std::max(64, threads); +} + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h b/paddle/pten/kernels/gpu/elementwise.h similarity index 61% rename from paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h rename to paddle/pten/kernels/gpu/elementwise.h index 134ad08913c21..f78328c01a30d 100644 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,11 +14,309 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace pten { +namespace kps = paddle::operators::kernel_primitives; +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; + +/* Packing scalar type T(float, int etc.) 
into Array type + for supporting multiple-output feature in elementwise system.*/ +template +using ConditionalT = + typename std::conditional_t>; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result); +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseAny( + result, args, func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseUnary( + result, args[0], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseBinary( + result, args[0], args[1], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseTernary( + result, args[0], args[1], args[2], func); + } +}; + +template +struct ElementwiseWriteDataCaller { + __device__ __forceinline__ void operator()( + paddle::framework::Array outs, + ConditionalT src[VecSize], + int block_offset, + int num) { + OutT dst[NumOuts][VecSize]; +#pragma unroll + for (int i = 0; i < VecSize; ++i) { +#pragma unroll + for (int j = 0; j < NumOuts; ++j) { + dst[j][i] = (src[i])[j]; + } + } +#pragma unroll + for (int i = 0; i < NumOuts; ++i) { + kps::WriteData( + outs[i] + block_offset, dst[i], num); + } + } +}; + +template +struct ElementwiseWriteDataCaller { + __device__ __forceinline__ void operator()( + paddle::framework::Array outs, + OutT src[VecSize], + int block_offset, + int num) { + kps::WriteData( + outs[0] + block_offset, src, num); + } +}; + +template +__device__ void VectorizedElementwiseKernelImpl( + const paddle::framework::Array &in, + paddle::framework::Array outs, + int num, + int data_offset, + Functor func) { + InT args[Arity][VecSize]; + ConditionalT result[VecSize]; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + kps::ReadData( + args[i], in[i] + data_offset, num); + } + + constexpr bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller, + VecSize, + Functor, + Arity, + kCallElementwiseAny>()(func, args, result); + + ElementwiseWriteDataCaller()( + outs, result, data_offset, num); +} + +template +__global__ void VectorizedElementwiseKernel( + paddle::framework::Array ins, + paddle::framework::Array outs, + int size, + int main_offset, + Functor func) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + for (; data_offset < main_offset; data_offset += stride) { + VectorizedElementwiseKernelImpl( + ins, outs, VecSize * BLOCK_NUM_X, data_offset, func); + } + + int num = size - data_offset; + if (num > 0) { + VectorizedElementwiseKernelImpl(ins, outs, num, data_offset, func); + } +} + +template +int GetVectorizedSizeForTensors(const std::vector &ins, + const std::vector &outs) { + int vec_size = 4; + for (auto iter = ins.begin(); iter != ins.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + for (auto iter = outs.begin(); iter != outs.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + return vec_size; +} + +template +void 
ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + auto numel = ins[0]->numel(); + int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize); + int grid_size = + ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; + auto stream = ctx.stream(); + paddle::framework::Array ins_data; + paddle::framework::Array outs_data; + + for (int i = 0; i < Arity; ++i) { + ins_data[i] = ins[i]->data(); + } + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = (*outs)[i]->mutable_data(); + } +#ifdef PADDLE_WITH_XPU2 + block_size = 128; + grid_size = 8; + int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + VectorizedElementwiseKernel<<>>( + ins_data, outs_data, numel, main_offset, func); +#else + int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + VectorizedElementwiseKernel<<>>( + ins_data, outs_data, numel, main_offset, func); +#endif +} + +template +void LaunchSameDimsElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + paddle::platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_EQ(outs->size(), + NumOuts, + paddle::platform::errors::InvalidArgument( + "Number of outputs shall equal to number of functions, " + "but number of outputs is %d, of functions is %d.", + outs->size(), + NumOuts)); + + if (NumOuts > 1) { + for (int i = 1; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + paddle::platform::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, " + "but %dth output tensor`s shape is not.", + i)); + } + } + + // calculate the max vec_size for all ins and outs + int vec_size = GetVectorizedSizeForTensors(ins, *outs); + switch (vec_size) { + case 4: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 2: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 1: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)( @@ -532,4 +830,34 @@ void LaunchBroadcastElementwiseCudaKernel( } } +template +void LaunchElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag &= ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, outs, func); + } else { + axis = axis == -1 + ? 
*std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + LaunchBroadcastElementwiseCudaKernel( + cuda_ctx, ins, outs, axis, func); + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 051f7cb3bdd05..f41934313d674 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/gpu/elementwise.h" #include "paddle/pten/kernels/gpu/reduce.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -30,12 +30,9 @@ namespace cub = hipcub; #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" -namespace kps = paddle::operators::kernel_primitives; - namespace pten { #define DEFINE_CUDA_ELEMENTWISE_OP(name) \ diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h deleted file mode 100644 index 83d662b14e7fc..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" - -namespace pten { - -template -void LaunchElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &cuda_ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - std::vector dims_size; - bool no_broadcast_flag = true; - for (auto *in : ins) { - no_broadcast_flag &= ins[0]->dims() == in->dims(); - dims_size.emplace_back(in->dims().size()); - } - if (no_broadcast_flag) { - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, outs, func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - LaunchBroadcastElementwiseCudaKernel( - cuda_ctx, ins, outs, axis, func); - } -} - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h deleted file mode 100644 index ae384693249a4..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/function_traits.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/elementwise_base.h" - -namespace pten { -namespace kps = paddle::operators::kernel_primitives; -enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; - -/* Packing scalar type T(float, int etc.) into Array type - for supporting multiple-output feature in elementwise system.*/ -template -using ConditionalT = - typename std::conditional_t>; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result); -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseAny( - result, args, func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseUnary( - result, args[0], func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseBinary( - result, args[0], args[1], func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseTernary( - result, args[0], args[1], args[2], func); - } -}; - -template -struct ElementwiseWriteDataCaller { - __device__ __forceinline__ void operator()( - paddle::framework::Array outs, - ConditionalT src[VecSize], - int block_offset, - int num) { - OutT dst[NumOuts][VecSize]; -#pragma unroll - for (int i = 0; i < VecSize; ++i) { -#pragma unroll - for (int j = 0; j < NumOuts; ++j) { - dst[j][i] = (src[i])[j]; - } - } -#pragma unroll - for (int i = 0; i < NumOuts; ++i) { - kps::WriteData( - outs[i] + block_offset, dst[i], num); - } - } -}; - -template -struct ElementwiseWriteDataCaller { - __device__ __forceinline__ void operator()( - paddle::framework::Array outs, - OutT src[VecSize], - int block_offset, - int num) { - kps::WriteData( - outs[0] + block_offset, src, num); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h deleted file mode 100644 index f37e3b0b5e3b3..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h +++ /dev/null @@ -1,253 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" - -#ifdef __HIPCC__ -#define ELEMENTWISE_BLOCK_SIZE 256 -#else -#define ELEMENTWISE_BLOCK_SIZE 512 -#endif - -namespace pten { - -/* -* According to NVIDIA, if number of threads per block is 64/128/256/512, -* cuda performs better. And number of blocks should be greater (at least -* 2x~4x) than number of SMs. Hence, SM count is took into account within -* this function to determine the right number of threads per block. -*/ -inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, - int64_t numel, - int vec_size) { - int threads = ELEMENTWISE_BLOCK_SIZE; - int sm_count = ctx.GetSMCount(); - int active_threads_num = numel / vec_size; - if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 1)); - } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. 
- return std::max(64, threads); -} - -template -__device__ void VectorizedElementwiseKernelImpl( - const paddle::framework::Array &in, - paddle::framework::Array outs, - int num, - int data_offset, - Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - kps::ReadData( - args[i], in[i] + data_offset, num); - } - - constexpr bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; - ElementwisePrimitiveCaller, - VecSize, - Functor, - Arity, - kCallElementwiseAny>()(func, args, result); - - ElementwiseWriteDataCaller()( - outs, result, data_offset, num); -} - -template -__global__ void VectorizedElementwiseKernel( - paddle::framework::Array ins, - paddle::framework::Array outs, - int size, - int main_offset, - Functor func) { - int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - for (; data_offset < main_offset; data_offset += stride) { - VectorizedElementwiseKernelImpl( - ins, outs, VecSize * BLOCK_NUM_X, data_offset, func); - } - - int num = size - data_offset; - if (num > 0) { - VectorizedElementwiseKernelImpl(ins, outs, num, data_offset, func); - } -} - -template -int GetVectorizedSizeForTensors(const std::vector &ins, - const std::vector &outs) { - int vec_size = 4; - for (auto iter = ins.begin(); iter != ins.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); - } - for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); - } - return vec_size; -} - -template -void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { - auto numel = ins[0]->numel(); - int block_size = GetThreadsConfig(ctx, numel, VecSize); - int grid_size = - ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; - auto stream = ctx.stream(); - paddle::framework::Array ins_data; - paddle::framework::Array outs_data; - - for (int i = 0; i < Arity; ++i) { - ins_data[i] = ins[i]->data(); - } - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->mutable_data(); - } -#ifdef PADDLE_WITH_XPU2 - block_size = 128; - grid_size = 8; - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; - VectorizedElementwiseKernel<<>>( - ins_data, outs_data, numel, main_offset, func); -#else - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; - VectorizedElementwiseKernel<<>>( - ins_data, outs_data, numel, main_offset, func); -#endif -} - -template -void LaunchSameDimsElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { - using Traits = paddle::platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - paddle::platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. 
But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_EQ(outs->size(), - NumOuts, - paddle::platform::errors::InvalidArgument( - "Number of outputs shall equal to number of functions, " - "but number of outputs is %d, of functions is %d.", - outs->size(), - NumOuts)); - - if (NumOuts > 1) { - for (int i = 1; i < NumOuts; ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - paddle::platform::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, " - "but %dth output tensor`s shape is not.", - i)); - } - } - - // calculate the max vec_size for all ins and outs - int vec_size = GetVectorizedSizeForTensors(ins, *outs); - switch (vec_size) { - case 4: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case 2: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case 1: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } -} - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/manipulation.h b/paddle/pten/kernels/hybird/general/manipulation.h deleted file mode 100644 index 85f6b613ac609..0000000000000 --- a/paddle/pten/kernels/hybird/general/manipulation.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace general { - -inline void SetXShape(const DenseTensor& x, DenseTensor* xshape) { - const auto& in_dims = x.meta().dims; - std::vector xshape_dims(in_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < in_dims.size(); ++i) { - xshape_dims[i + 1] = in_dims[i]; - } - xshape->Resize(paddle::framework::make_ddim(xshape_dims)); - xshape->ResetLoD(x.meta().lod); -} - -} // namespace general -} // namespace pten From 0c02d2ed7c840360ed42023902dc5da96552b3fd Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 6 Jan 2022 09:55:18 +0800 Subject: [PATCH 017/151] =?UTF-8?q?=E3=80=90PTen=E3=80=91Adjust=20the=20fo?= =?UTF-8?q?rmat=20of=20full=20kernel=20(#38596)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adjust the full kernel * remove creation.h * use Empty to create tensor in full --- paddle/fluid/operators/fill_any_like_op.h | 4 +- paddle/pten/all.h | 1 - paddle/pten/include/creation.h | 59 ------------------- paddle/pten/kernels/cpu/full_kernel.cc | 4 +- paddle/pten/kernels/full_kernel.h | 42 +++++++++++-- paddle/pten/kernels/gpu/full_kernel.cu | 4 +- paddle/pten/kernels/impl/full_kernel_impl.h | 18 +++--- .../tests/kernels/test_creation_dev_api.cc | 3 +- 8 files changed, 55 insertions(+), 80 deletions(-) delete mode 100644 paddle/pten/include/creation.h diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 3ad56827f8344..287bbbfa3b343 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/creation.h" +#include "paddle/pten/kernels/full_kernel.h" namespace paddle { namespace operators { @@ -65,7 +65,7 @@ class FillAnyLikeKernel : public framework::OpKernel { const auto& dev_ctx = context.template device_context(); // call new kernel - pten::FullLike(dev_ctx, value, pt_out.get()); + pten::FullLikeKernel(dev_ctx, value, pt_out.get()); } }; diff --git a/paddle/pten/all.h b/paddle/pten/all.h index b7ef1c1ec2611..844114c341d67 100644 --- a/paddle/pten/all.h +++ b/paddle/pten/all.h @@ -16,7 +16,6 @@ limitations under the License. */ // developer apis #include "paddle/pten/include/core.h" -#include "paddle/pten/include/creation.h" #include "paddle/pten/include/infermeta.h" #include "paddle/pten/include/linalg.h" #include "paddle/pten/include/manipulation.h" diff --git a/paddle/pten/include/creation.h b/paddle/pten/include/creation.h deleted file mode 100644 index fa5bd49ca3026..0000000000000 --- a/paddle/pten/include/creation.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/empty_kernel.h" -#include "paddle/pten/kernels/full_kernel.h" - -namespace pten { - -// TODO(YuanRisheng) This function name should be same as User API name. -// TODO(zyfncg) Automatic code generation -template -DenseTensor Full(const ContextT& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DataType dtype = DataType::FLOAT32, - Backend backend = Backend::CPU, // Is backend needed here? - DataLayout layout = DataLayout::NCHW) { - auto out_meta = CreateInferMeta(shape, dtype, layout); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Full(dev_ctx, shape, val, &dense_out); - return dense_out; -} - -template -DenseTensor FullLike( - const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& val, - DataType dtype = DataType::UNDEFINED, - Backend backend = Backend::UNDEFINED, // Is backend needed here? - DataLayout layout = DataLayout::UNDEFINED) { - auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - FullLike(dev_ctx, val, &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 4912656bb2aef..1ae8001d79dc7 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -21,7 +21,7 @@ limitations under the License. */ PT_REGISTER_CTX_KERNEL(full, CPU, ALL_LAYOUT, - pten::Full, + pten::FullKernel, float, double, uint8_t, @@ -37,7 +37,7 @@ PT_REGISTER_CTX_KERNEL(full, PT_REGISTER_CTX_KERNEL(full_like, CPU, ALL_LAYOUT, - pten::FullLike, + pten::FullLikeKernel, float, double, int, diff --git a/paddle/pten/kernels/full_kernel.h b/paddle/pten/kernels/full_kernel.h index 5bf6e37c36e57..bc484fb4edffa 100644 --- a/paddle/pten/kernels/full_kernel.h +++ b/paddle/pten/kernels/full_kernel.h @@ -18,15 +18,47 @@ #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/nullary.h" +#include "paddle/pten/kernels/empty_kernel.h" + namespace pten { template -void Full(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out); +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DenseTensor* out); + +template +void FullLikeKernel(const Context& dev_ctx, + const Scalar& val, + DenseTensor* out); + +template +DenseTensor Full(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DataType dtype = DataType::FLOAT32, + Backend backend = Backend::CPU, // Is backend needed here? + DataLayout layout = DataLayout::NCHW) { + auto out_meta = CreateInferMeta(shape, dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FullKernel(dev_ctx, shape, val, &dense_out); + return dense_out; +} template -void FullLike(const Context& dev_ctx, const Scalar& val, DenseTensor* out); +DenseTensor FullLike( + const Context& dev_ctx, + const DenseTensor& x, + const Scalar& val, + DataType dtype = DataType::UNDEFINED, + Backend backend = Backend::UNDEFINED, // Is backend needed here? 
+ DataLayout layout = DataLayout::UNDEFINED) { + auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FullLikeKernel(dev_ctx, val, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index 16389d7749bf1..ae1f8529db3de 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -21,7 +21,7 @@ limitations under the License. */ PT_REGISTER_CTX_KERNEL(full, GPU, ALL_LAYOUT, - pten::Full, + pten::FullKernel, float, double, uint8_t, @@ -36,7 +36,7 @@ PT_REGISTER_CTX_KERNEL(full, PT_REGISTER_CTX_KERNEL(full_like, GPU, ALL_LAYOUT, - pten::FullLike, + pten::FullLikeKernel, float, double, int, diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index c77b7a7077824..9be40e22a0360 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -24,7 +24,7 @@ limitations under the License. */ namespace pten { -template +template void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { tensor->mutable_data(); auto t = pten::EigenVector::Flatten(*tensor); @@ -32,16 +32,18 @@ void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { } template -void Full(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out) { +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DenseTensor* out) { out->Resize(paddle::framework::make_ddim(shape.GetData())); - FullValue(dev_ctx, out, val.to()); + FullValue(dev_ctx, out, val.to()); } template -void FullLike(const Context& dev_ctx, const Scalar& val, DenseTensor* out) { +void FullLikeKernel(const Context& dev_ctx, + const Scalar& val, + DenseTensor* out) { auto value = val.to(); using CommonType = typename std::common_type< float, @@ -66,7 +68,7 @@ void FullLike(const Context& dev_ctx, const Scalar& val, DenseTensor* out) { static_cast(std::numeric_limits::lowest()), static_cast(std::numeric_limits::max()), static_cast(value))); - FullValue(dev_ctx, out, value); + FullValue(dev_ctx, out, value); } } // namespace pten diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 8469b94b797c8..4d753f7d09b8e 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -15,7 +15,8 @@ limitations under the License. 
*/ #include #include -#include "paddle/pten/include/creation.h" +#include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/full_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" From c0e2b98e2222f48896eb29f48a31fa4022d5d2a7 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Thu, 6 Jan 2022 10:39:26 +0800 Subject: [PATCH 018/151] Remove useless headers for some grad ops (#38732) * fix the wrong filename * first commit --- .../fluid/operators/elementwise/elementwise_div_op.cu | 1 + .../fluid/operators/elementwise/elementwise_div_op.h | 11 ----------- .../fluid/operators/elementwise/elementwise_mul_op.cu | 7 ------- .../fluid/operators/elementwise/elementwise_mul_op.h | 5 +---- .../fluid/operators/elementwise/elementwise_sub_op.cu | 3 --- .../fluid/operators/elementwise/elementwise_sub_op.h | 4 ---- 6 files changed, 2 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 7a25f65366901..c737cf5cf69fa 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index b13a0539ec6ad..d9f7bbc56a902 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -14,21 +14,10 @@ limitations under the License. */ #pragma once -#include #include #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/framework/pten_utils.h" - -// only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index cdf376fd6a8cc..f4098398ffd0b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,15 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 5cff3173e8115..9e75f6983c4a4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -17,14 +17,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/cpu_info.h" // only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index cba261a394732..4e1a5b5eecfa1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -12,11 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 6a51d7c2a45ad..f5cb81bf56864 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,12 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/blas.h" // only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { From 35213c6469b0b46448a02f76da512c57906a2a83 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Thu, 6 Jan 2022 10:54:23 +0800 Subject: [PATCH 019/151] fix bugs: output of splited fc is wrong (#38724) --- python/paddle/distributed/collective.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 14a411ae25356..3731332d1e777 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1219,6 +1219,7 @@ def _parallel_linear(x, inputs={'X': linear_out}, outputs={'Out': out}, attrs={ + 'rank': inner_rank, 'ring_id': ring_id, 'nranks': nranks, 'use_calc_stream': True, From 9c1167cfd0c89d6039af4def02391490bf033d19 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 6 Jan 2022 11:06:31 +0800 Subject: [PATCH 020/151] nearest_interp_v2 bug fix (#38725) * bug fix * remove blank --- paddle/fluid/inference/tensorrt/op_teller.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6f0dec45644ef..8504474168d53 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -726,6 +726,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); if (!(out_h > 0 && out_w > 0)) { + if (scale.size() < 2) return false; if (scale[0] <= 0.f || scale[1] <= 0.f) { VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is " "not set."; From aec6e8a961ea22af9e144b46134bbaa9fb56e0fd Mon Sep 17 00:00:00 2001 From: minghaoBD <79566150+minghaoBD@users.noreply.github.com> Date: Thu, 6 Jan 2022 11:28:42 +0800 Subject: [PATCH 021/151] [Paddle-ASP]Asp sharding (#37725) --- .../meta_optimizers/sharding_optimizer.py | 1 + python/paddle/fluid/contrib/sparsity/asp.py | 24 +++- .../asp/test_fleet_with_asp_sharding.py | 120 ++++++++++++++++++ .../unittests/fleet_meta_optimizer_base.py | 2 + .../test_fleet_sharding_meta_optimizer.py | 47 +++++++ 5 files changed, 189 insertions(+), 5 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 8b75c57fab407..52468ab533496 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -53,6 +53,7 @@ def __init__(self, optimizer): "AMPOptimizer", "LarsOptimizer", "LambOptimizer", + "ASPOptimizer", # "ModelParallelOptimizer", # "PipelineOptimizer", ] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 61e3a61fc9cd2..937fcdf0463be 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ 
b/python/paddle/fluid/contrib/sparsity/asp.py @@ -16,12 +16,17 @@ Functions for Auto SParsity (ASP) training and inference. """ +import os import copy import numpy as np import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid import core + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() __all__ = [ 'decorate', 'prune_model', 'set_excluded_layers', 'reset_excluded_layers' @@ -150,7 +155,8 @@ def prune_model(main_program=None, n=2, m=4, mask_algo='mask_1d', - with_mask=True): + with_mask=True, + sharding=False): r""" Pruning parameters of supported layers in :attr:`main_program` via specified mask generation function given by :attr:`mask_algo`. This @@ -173,6 +179,7 @@ def prune_model(main_program=None, mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. + sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: @@ -214,8 +221,12 @@ def prune_model(main_program=None, # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` sparsity.prune_model(main_program, mask_algo='mask_2d_best') """ - device = paddle.device.get_device() - place = paddle.set_device(device) + if sharding: + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + else: + device = paddle.device.get_device() + place = paddle.set_device(device) MaskAlgo_mapping = { 'mask_1d': sparsity.MaskAlgo.MASK_1D, @@ -528,8 +539,11 @@ def _insert_sparse_mask_ops(cls, main_program, param_grads): 'Y': asp_info.mask_vars[param_grad[0].name] }, outputs={'Out': param_grad[0]}, - attrs={'axis': -1, - 'use_mkldnn': False}) + attrs={ + 'axis': -1, + 'use_mkldnn': False, + OP_ROLE_KEY: OpRole.Optimize + }) class OptimizerWithSparsityGuarantee(object): diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py new file mode 100644 index 0000000000000..dd609d3ae2e11 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import os +import sys +from paddle.static import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] + +paddle.enable_static() + + +class TestFleetWithASPSharding(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + os.environ['FLAGS_enable_parallel_graph'] = "0" + os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = "0.1" + os.environ['FLAGS_sync_nccl_allreduce'] = "1" + os.environ['FLAGS_eager_delete_tensor_gb'] = "0" + os.environ['FLAGS_fuse_parameter_memory_size'] = "32" + os.environ['FLAGS_fuse_parameter_groups_size'] = "50" + os.environ['FLAGS_check_nan_inf'] = "0" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.static.data( + name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = fluid.layers.fc(input=fc_1, size=64, act='tanh') + fc_3 = fluid.layers.fc(input=fc_2, size=64, act='tanh') + fc_4 = fluid.layers.fc(input=fc_3, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_4, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + dist_strategy = paddle.distributed.fleet.DistributedStrategy() + dist_strategy.sharding = True + dist_strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 32, + "segment_anchors": None, + "sharding_degree": 8, + "mp_degree": 1, + "hybrid_dp": False, + "gradient_merge_acc_step": 1 + } + dist_strategy.nccl_comm_num = 1 + dist_strategy.asp = True + return avg_cost, dist_strategy, input_x, input_y + + def test_with_asp_sharding(self): + if sys.platform == 'win32': + return + print(sys.platform) + fleet.init(is_collective=True) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy, input_x, input_y = self.net(train_prog, + startup_prog) + + with fluid.program_guard(train_prog, startup_prog): + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + if paddle.fluid.is_compiled_with_cuda(): + place = fluid.CUDAPlace( + int(os.environ.get('FLAGS_selected_gpus', 0))) + else: + place = fluid.CPUPlace() + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) + exe.run(startup_prog) + + sparsity.prune_model(train_prog, sharding=True) + + data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) + exe.run(train_prog, feed=feeder.feed([data])) + + for param in train_prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(train_prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + 
paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 6150df5c29a9b..83d53cc22a205 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -229,5 +229,7 @@ def set_strategy(self, strategy, name): "micro_batch_size": 2, "accumulate_steps": 4, } + elif name == 'asp': + strategy.asp = True else: raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index c7eaf4e0ff33d..42ec81ad9d869 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -190,6 +190,53 @@ def test_sharding_amp_recompute_optimizer(self): 'momentum', 'momentum' ]) + def test_sharding_amp_asp_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'asp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + + self.assertIn('@BroadCast', ''.join(vars)) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + self.assertEqual( + set(parameters), + set([ + 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', + 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'fc_2.b_0_velocity_0' + ])) + self.assertEqual(ops, [ + 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', + 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', + 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', + 'cast', 'cast', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum', 'elementwise_mul' + ]) + def test_sharding_weight_decay(self): train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( ) From a28eb0f0920efde7166b994fa46014d63a3b0e7b Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 6 Jan 2022 14:20:16 +0800 Subject: [PATCH 022/151] Add NPU dockerfile (#38659) --- tools/dockerfile/Dockerfile.paddle-npu-build | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 
tools/dockerfile/Dockerfile.paddle-npu-build diff --git a/tools/dockerfile/Dockerfile.paddle-npu-build b/tools/dockerfile/Dockerfile.paddle-npu-build new file mode 100644 index 0000000000000..62361880cc6eb --- /dev/null +++ b/tools/dockerfile/Dockerfile.paddle-npu-build @@ -0,0 +1,5 @@ +FROM registry.baidubce.com/paddlepaddle/paddle-npu:latest-dev-cann5.0.2.alpha005-gcc82-x86_64-with-driver +RUN apt-get install pigz -y +RUN apt-get remove -y openjdk* +CMD ["/bin/bash"] +EXPOSE 22 From fc990d08b2ba4754d68628809ff776381fb66cf0 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Thu, 6 Jan 2022 15:18:13 +0800 Subject: [PATCH 023/151] Revert "Remove useless headers for some grad ops (#38732)" (#38743) This reverts commit c0e2b98e2222f48896eb29f48a31fa4022d5d2a7. --- .../fluid/operators/elementwise/elementwise_div_op.cu | 1 - .../fluid/operators/elementwise/elementwise_div_op.h | 11 +++++++++++ .../fluid/operators/elementwise/elementwise_mul_op.cu | 7 +++++++ .../fluid/operators/elementwise/elementwise_mul_op.h | 5 ++++- .../fluid/operators/elementwise/elementwise_sub_op.cu | 3 +++ .../fluid/operators/elementwise/elementwise_sub_op.h | 4 ++++ 6 files changed, 29 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index c737cf5cf69fa..7a25f65366901 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index d9f7bbc56a902..b13a0539ec6ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -14,10 +14,21 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/framework/pten_utils.h" + +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index f4098398ffd0b..cdf376fd6a8cc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,8 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" +// only can include the headers in paddle/top/api dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/math.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 9e75f6983c4a4..5cff3173e8115 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -17,11 +17,14 @@ limitations under the License. */ #include #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/cpu_info.h" // only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 4e1a5b5eecfa1..cba261a394732 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index f5cb81bf56864..6a51d7c2a45ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,8 +16,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" // only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { From 89c0877ee502f0f59f1cb361cd3e0ad17b1eda23 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Thu, 6 Jan 2022 15:53:13 +0800 Subject: [PATCH 024/151] add mkldnn matmulv2 ut (#38749) --- .../ir/inference/test_mkldnn_matmulv2_op.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py new file mode 100644 index 0000000000000..9fa98045ef303 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import MkldnnAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestMkldnnMatmulv2Op(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + if len(program_config.inputs["input_data2"].shape) == 4: + if program_config.inputs["input_data1"].shape[ + -4] != 1 and program_config.inputs["input_data2"].shape[ + -4] != 1: + if program_config.inputs["input_data1"].shape[ + -4] != program_config.inputs["input_data2"].shape[-4]: + return False + + if program_config.inputs["input_data1"].shape[ + -3] != 1 and program_config.inputs["input_data2"].shape[ + -3] != 1: + if program_config.inputs["input_data1"].shape[ + -3] != program_config.inputs["input_data2"].shape[-3]: + return False + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(type, *args, **kwargs): + transpose_X = kwargs["transpose_X"] + transpose_Y = kwargs["transpose_Y"] + batch_size1 = kwargs["batch_size1"] + batch_size2 = kwargs["batch_size2"] + channel1 = kwargs["channel1"] + channel2 = kwargs["channel2"] + input_dim = kwargs["input_dim"] + y_dim_len = kwargs["y_dim_len"] + if transpose_X and transpose_Y: + shape_x = [batch_size1, channel1, input_dim, 32] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, 64, input_dim] + elif y_dim_len == 3: + shape_y = [channel2, 64, input_dim] + elif transpose_X: + shape_x = [batch_size1, channel1, input_dim, 32] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, input_dim, 64] + elif y_dim_len == 3: + shape_y = [channel2, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size1, channel1, 32, input_dim] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, 8, input_dim] + elif y_dim_len == 3: + shape_y = [channel2, 8, input_dim] + else: + shape_x = [batch_size1, channel1, 32, input_dim] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, input_dim, 16] + elif y_dim_len == 3: + shape_y = [channel2, input_dim, 16] + + if type == "x": + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig( + type="matmul_v2", + inputs={"X": ["input_data1"], + "Y": ["input_data2"]}, + outputs={"Out": ["matmul_output"]}, + attrs={ + "trans_x": kwargs["transpose_X"], + "trans_y": kwargs["transpose_Y"], + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }) + + program_config = ProgramConfig( + ops=[matmul_op], + weights={}, + inputs={ + "input_data1": TensorConfig(data_gen=partial( + generate_input, "x", *args, **kwargs)), + "input_data2": TensorConfig(data_gen=partial( + generate_input, "y", *args, **kwargs)) + }, + outputs=["matmul_output"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + transpose_X=st.booleans(), + transpose_Y=st.booleans(), + y_dim_len=st.sampled_from([3, 4]), + batch_size1=st.integers( + min_value=1, max_value=4), + batch_size2=st.integers( + min_value=1, max_value=4), + channel1=st.sampled_from([1, 16, 32, 64]), + 
channel2=st.sampled_from([1, 16, 32, 64]), + input_dim=st.sampled_from([16, 32, 64])) + def test(self, *args, **kwargs): + self.run_test(*args, **kwargs) + + +if __name__ == "__main__": + unittest.main() From d422a1ede99d266b45a3acaf0024d46763482318 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 6 Jan 2022 16:27:26 +0800 Subject: [PATCH 025/151] Handled special sum_grad_op code gen in Eager Dygraph (#38573) * Handled special sum_grad_op code gen in Eager Dygraph * Fixed merge issues --- .../auto_code_generator/eager_generator.cc | 673 +++++++++++------- .../tests/unittests/test_imperative_basic.py | 9 +- 2 files changed, 407 insertions(+), 275 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 448fadd4b4644..2c3207b116e29 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -27,6 +27,8 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +#define NUM_CREATED_DUP_INPUTS 4 + namespace paddle { namespace framework { @@ -46,6 +48,62 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } +static bool IgnoreGradAttribute(const std::string& op_type, + const std::string& attr_name) { + // Attributes in operators_with_attrs are created manually during code + // generation + // We should ignore these arbitrary attrs when setting up grad attribute map + if (operators_with_attrs.count(op_type)) { + if (operators_with_attrs[op_type].count(attr_name)) { + return true; + } + } + + return false; +} + +static void PrepareAttrMapForOps() { + // Handle "run_program_op" + static framework::ProgramDesc fake_prog; + operators_with_attrs["run_program"] = {}; + operators_with_attrs["run_program"]["global_block"] = + fake_prog.MutableBlock(0); + + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; + + // Handle "c_split" + operators_with_attrs["c_split"] = {}; + operators_with_attrs["c_split"]["nranks"] = 1; +} + /* --- Helper Objects --- */ class ForwardGenerationInfo { public: @@ -136,6 +194,13 @@ class GradNodeGenerationInfo { return &grad_outs_; } + const paddle::framework::AttributeMap& GetGradAttrs() const { + return grad_attrs_; + } + paddle::framework::AttributeMap* GetMutableGradAttrs() { + return &grad_attrs_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -147,6 +212,7 @@ class GradNodeGenerationInfo { std::map>> grad_outs_; + paddle::framework::AttributeMap 
grad_attrs_; }; public: @@ -677,27 +743,48 @@ static bool CollectGradInformationFromOpInfo( std::map>> ins; - for (const proto::OpProto::Var& input : op_proto.inputs()) { - const std::string& in_name = input.name(); - - // Handle dispensable input: - // 1. At python level, dispensable input will be detected at Python-C - // interface and filled with an empty vector - // 2. At C++ level, customers should always pass an empty vector for any - // dispensable input - // 3. During further lowering, there will always be a placeholder VarBase - // in ins/outs no matter whether it's dispensable or not - // As a result, we always create input VarBase regardless of its - // dispensability. - - // Handle duplicable input: list(VarBase) or VarBase - // We dont know the exact number of inputs expected, - // but we only need to identify the slot name order, - // therefore fill in 1 single input VarBase is enough in this scenario - ins[in_name] = {std::shared_ptr( - new paddle::imperative::VarBase("auto_" + in_name))}; - ins[in_name][0]->SetOverridedStopGradient(false); - ins[in_name][0]->MutableVar()->GetMutable(); + + if (op_proto.inputs().size() == 1 && op_proto.outputs().size() == 1 && + op_proto.inputs()[0].duplicable() && + !op_proto.outputs()[0].duplicable()) { + VLOG(6) << "Handle op with special op_bases: " << op_type; + // @special case (sum_op): for ops with single duplicable input and single + // non-duplicable output + // feed in NUM_CREATED_DUP_INPUTS inputs to detect a + // special scenario. + const std::string& in_name = op_proto.inputs()[0].name(); + ins[in_name] = {}; + for (size_t i = 0; i < NUM_CREATED_DUP_INPUTS; i++) { + ins[in_name].emplace_back(std::shared_ptr( + new paddle::imperative::VarBase("auto_" + in_name + "_" + + std::to_string(i)))); + ins[in_name][i]->SetOverridedStopGradient(false); + ins[in_name][i]->MutableVar()->GetMutable(); + } + } else { + for (const proto::OpProto::Var& input : op_proto.inputs()) { + const std::string& in_name = input.name(); + + // Handle dispensable input: + // 1. At python level, dispensable input will be detected at Python-C + // interface and filled with an empty vector + // 2. At C++ level, customers should always pass an empty vector for any + // dispensable input + // 3. During further lowering, there will always be a placeholder VarBase + // in ins/outs no matter whether it's dispensable or not + // As a result, we always create input VarBase regardless of its + // dispensability. 
+ + // Handle duplicable input: list(VarBase) or VarBase + // We dont know the exact number of inputs expected, + // but we only need to identify the slot name order, + // therefore fill in 1 single input VarBase is enough in this scenario + + ins[in_name] = {std::shared_ptr( + new paddle::imperative::VarBase("auto_" + in_name))}; + ins[in_name][0]->SetOverridedStopGradient(false); + ins[in_name][0]->MutableVar()->GetMutable(); + } } VLOG(6) << "Prepared Forward Ins Map, size = " << ins.size(); @@ -725,7 +812,6 @@ static bool CollectGradInformationFromOpInfo( VLOG(6) << "Checking AttributeMap Settings"; attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); default_attrs = attr_checker->GetDefaultAttrMap(); - VLOG(6) << "AttributeMap Checking Passed"; } else { VLOG(6) << "Detected Null Attribute Checker, use empty default_attrs"; } @@ -797,13 +883,13 @@ static bool CollectGradInformationFromOpInfo( (*op_base_infos)[index].SetOpBaseType(op_base.Type()); } - /* ------ Get Grad ins/outs ---- */ - // In case of multiple OpBase, stitch all the respective ins/outs into one + /* ------ Get Grad ins/outs/attrs ---- */ VLOG(6) << "In function size: " << grad_node->size(); for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { int index = std::distance(grad_node->begin(), iter); auto* op_base_grad_ins = (*op_base_infos)[index].GetMutableGradIns(); auto* op_base_grad_outs = (*op_base_infos)[index].GetMutableGradOuts(); + auto* op_base_grad_attrs = (*op_base_infos)[index].GetMutableGradAttrs(); const paddle::imperative::OpBase& op_base = *iter; const std::map& @@ -811,6 +897,8 @@ static bool CollectGradInformationFromOpInfo( const std::map& g_outs = op_base.GetOutsMap(); + *op_base_grad_attrs = op_base.Attrs(); + for (const auto& it : g_ins) { if (!op_base_grad_ins->count(it.first)) (*op_base_grad_ins)[it.first] = {}; @@ -1395,6 +1483,261 @@ static std::pair GenerateForwardFunctionContents( return {fwd_function_str, dygraph_function_declaration_str}; } +static std::string GenerateSingleOpBase( + const std::string& fwd_op_type, const std::string& op_base_type, + const std::unordered_map& fwd_inputs_name_pos_map, + const std::unordered_map& fwd_outputs_name_pos_map, + const std::vector& in_vars, + const std::map& grad_ins_fwd_slotname_map, + const std::map& grad_ins_grad_slotname_map, + const std::map& grad_outs_slotname_map, + const std::map< + std::string, + std::vector>>& + grad_ins, + const std::map< + std::string, + std::vector>>& + grad_outs, + const paddle::framework::AttributeMap& grad_attrs, + bool is_op_base_per_duplicable_input, size_t* outs_size) { + std::string generated_grad_function_body = ""; + + const std::string& ins_name = "ins" + std::to_string(*outs_size); + const std::string& outs_name = "outs" + std::to_string(*outs_size); + const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); + + // [Generation] Get Ins Map + std::string ins_contents_str = ""; + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + const char* GRAD_INS_FWD_CONTENT_TEMPLATE = + "{ \"%s\", " + "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(" + "&" + "this->%s, " + "nullptr)) },"; + ins_contents_str += + paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, + grad_input_name, struct_fwd_input_name); + + } else if 
(grad_ins_grad_slotname_map.count(grad_input_name)) { + // Fwd Tensor's Grad + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; + ins_contents_str += paddle::string::Sprintf( + GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); + + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." + "Unable to find forward slot name that matches %s", + grad_input_name)); + } + } + if (ins_contents_str.size() > 0) + ins_contents_str.pop_back(); // // Remove trailing "," + + const char* BWD_INS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string ins_map_str = + paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); + generated_grad_function_body += ins_map_str; + + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + } + + std::string outs_contents_str = ""; + for (auto iter : grad_outs) { + const std::string& grad_output_name = iter.first; + + if (grad_outs_slotname_map.count(grad_output_name)) { + // Fwd Tensor + const std::string& fwd_name = grad_outs_slotname_map.at(grad_output_name); + + /* Handle Special Case: "PullSparseOp", etc + + Forward: + + Ids W + | | + PullSparseOp + | + Out + + Backward: + + Ids GradOut W + | | | + PullSparseGradOp + | + GradOut + + Its grad output "GradOut" corresponds to forward output "Out", + where there is a hiden inplace involved. So we find "GradOut"'s + index + in + grads, and perform the inplace operation by constructing outs = + {{"Out", grads[i]}} + + GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] + outs = {{"Out", grads[i]}} + + For returns, append "GradOut" to the very end of return list. + */ + if (!fwd_inputs_name_pos_map.count(fwd_name)) { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + + size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); + + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); + + } else { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (duplicable_input_name_set.count(fwd_name) && + !is_op_base_per_duplicable_input) { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " + "this->OutputMeta()[%d].Size() ) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + } else { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance(" + ")." + "GenerateUniqueName())}},"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + } + } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." 
+ "Unable to find forward slot name that matches %s", + grad_output_name)); + } + } + if (outs_contents_str.size() > 0) + outs_contents_str.pop_back(); // // Remove trailing "," + + const char* BWD_OUTS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string outs_map_str = paddle::string::Sprintf( + BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += "\n"; + + VLOG(6) << "Generated Outs Map"; + + // [Generation] Get Attrs Map + const char* ATTRS_TEMPLATE = " auto %s = this->attr_map_;\n"; + std::string grad_attrs_str = + paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); + for (const auto& iter : grad_attrs) { + if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; + std::pair type_val = + GetAttrType(iter.second, false /*is_arg*/); + const char* GRAD_ATTRS_TEMPLATE = + " %s %s = %s;\n" + " %s[\"%s\"] = %s;\n"; + std::string var_name = iter.first + std::to_string(*outs_size); + grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, + attrs_name, iter.first, var_name); + } + generated_grad_function_body += grad_attrs_str; + + const char* TRACE_OP_TEMPLATE = + " // Pass the entire attribute map to TraceOp\n" + " // The underlying kernel will pickup whatever attribute they need " + "at runtime\n" + " egr::legacy::RunOp(\"%s\", %s, %s, %s,\n" + " egr::Controller::Instance().GetExpectedPlace(),\n" + " &this->default_attr_map_, false, {});\n"; + std::string trace_opbase_str = paddle::string::Sprintf( + TRACE_OP_TEMPLATE, op_base_type, ins_name, outs_name, attrs_name); + + generated_grad_function_body += trace_opbase_str; + + VLOG(6) << "Generated Attrs Map"; + + // [Generation] Get Return + std::string outputs_str = ""; + size_t num_appended_outputs = 0; + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_inputs_name_pos_map.count(fwd_name)) { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (!is_op_base_per_duplicable_input) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + } else { + const char* BWD_OUTPUT_TEMPLATE = + " " + "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" + ");\n"; + outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, + grad_out_name); + } + num_appended_outputs++; + } else { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + } + } + + /* Handle Special Case: "PullSparseOp", etc + For returns, append "GradOut" to the very end of return list. 
*/ + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_outputs_name_pos_map.count(fwd_name)) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, num_appended_outputs, outs_name, grad_out_name); + num_appended_outputs++; + } + } + + generated_grad_function_body += outputs_str; + generated_grad_function_body += "\n"; + + *outs_size += grad_outs.size(); + + return generated_grad_function_body; +} + /* ---------------------------------------------- */ /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ @@ -1408,6 +1751,7 @@ static std::string GenerateGradNodeCCContents( const std::unordered_map& fwd_outputs_name_pos_map = fwd_info.GetFwdOutputsNamePosMap(); const std::vector& in_vars = fwd_info.GetInVars(); + const std::vector& out_vars = fwd_info.GetOutVars(); VLOG(6) << "Generating Grad Node CC"; @@ -1454,9 +1798,26 @@ static std::string GenerateGradNodeCCContents( } */ + // This is a Copy + auto op_base_infos = bwd_info.GetOpBaseInfos(); + + /* Special Case: ops such as sum_grad_op is implemented abnormaly, + where it unpacked duplicable GradX and created one OpBase + corresponds to each member of GradX[i] + */ + bool is_op_base_per_duplicable_input = false; + if (in_vars.size() == 1 && out_vars.size() == 1 && in_vars[0].duplicable() && + !out_vars[0].duplicable() && + op_base_infos.size() == NUM_CREATED_DUP_INPUTS) { + is_op_base_per_duplicable_input = true; + // Only keep the first op_base + auto op_base_info = op_base_infos[0]; + op_base_infos.clear(); + op_base_infos.emplace_back(std::move(op_base_info)); + } + std::string generated_grad_function_body = ""; size_t outs_size = 0; - const auto& op_base_infos = bwd_info.GetOpBaseInfos(); for (size_t i = 0; i < op_base_infos.size(); i++) { const auto& op_base_info = op_base_infos[i]; @@ -1467,216 +1828,23 @@ static std::string GenerateGradNodeCCContents( const auto& grad_outs_slotname_map = op_base_info.GetGradOutsSlotnameMap(); const auto& grad_ins = op_base_info.GetGradIns(); const auto& grad_outs = op_base_info.GetGradOuts(); + const auto& grad_attrs = op_base_info.GetGradAttrs(); const std::string& op_base_type = op_base_info.GetOpBaseType(); - const std::string& ins_name = "ins" + std::to_string(i); - const std::string& outs_name = "outs" + std::to_string(i); - - outs_size += grad_outs.size(); - - // [Generation] Get Ins Map - std::string ins_contents_str = ""; - for (auto iter : grad_ins) { - const std::string& grad_input_name = iter.first; - - if (grad_ins_fwd_slotname_map.count(grad_input_name)) { - // Fwd Tensor - std::string struct_fwd_input_name = - grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; - const char* GRAD_INS_FWD_CONTENT_TEMPLATE = - "{ \"%s\", " - "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(" - "&" - "this->%s, " - "nullptr)) },"; - ins_contents_str += - paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, - grad_input_name, struct_fwd_input_name); - - } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { - // Fwd Tensor's Grad - size_t fwd_output_position = fwd_outputs_name_pos_map.at( - grad_ins_grad_slotname_map.at(grad_input_name)); - const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; - ins_contents_str += - 
paddle::string::Sprintf(GRAD_INS_GRAD_CONTENT_TEMPLATE, - grad_input_name, fwd_output_position); - - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." - "Unable to find forward slot name that matches %s", - grad_input_name)); - } - } - if (ins_contents_str.size() > 0) - ins_contents_str.pop_back(); // // Remove trailing "," - - const char* BWD_INS_MAP_TEMPLATE = - " std::map>> %s = { " - "%s };\n"; - std::string ins_map_str = paddle::string::Sprintf( - BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); - generated_grad_function_body += ins_map_str; - - VLOG(6) << "Generated Ins Map"; - - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); - } - - std::string outs_contents_str = ""; - for (auto iter : grad_outs) { - const std::string& grad_output_name = iter.first; - - if (grad_outs_slotname_map.count(grad_output_name)) { - // Fwd Tensor - const std::string& fwd_name = - grad_outs_slotname_map.at(grad_output_name); - - /* Handle Special Case: "PullSparseOp", etc - - Forward: - - Ids W - | | - PullSparseOp - | - Out - - Backward: - - Ids GradOut W - | | | - PullSparseGradOp - | - GradOut - - Its grad output "GradOut" corresponds to forward output "Out", - where there is a hiden inplace involved. So we find "GradOut"'s - index - in - grads, and perform the inplace operation by constructing outs = - {{"Out", grads[i]}} - - GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] - outs = {{"Out", grads[i]}} - - For returns, append "GradOut" to the very end of return list. - */ - if (!fwd_inputs_name_pos_map.count(fwd_name)) { - PADDLE_ENFORCE( - fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); - - size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); - - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); - - } else { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - if (duplicable_input_name_set.count(fwd_name)) { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " - "this->OutputMeta()[%d].Size() ) },"; - outs_contents_str += - paddle::string::Sprintf(GRAD_OUTS_CONTENT_TEMPLATE, - grad_output_name, fwd_input_position); - } else { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance(" - ")." - "GenerateUniqueName())}},"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); - } - } - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." 
- "Unable to find forward slot name that matches %s", - grad_output_name)); - } - } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," - - const char* BWD_OUTS_MAP_TEMPLATE = - " std::map>> %s = { " - "%s };\n"; - std::string outs_map_str = paddle::string::Sprintf( - BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); - generated_grad_function_body += outs_map_str; - generated_grad_function_body += "\n"; - - VLOG(6) << "Generated Outs Map"; - - // [Generation] Get Attrs Map - const char* TRACE_OP_TEMPLATE = - " // Pass the entire attribute map to TraceOp\n" - " // The underlying kernel will pickup whatever attribute they need " - "at runtime\n" - " egr::legacy::RunOp(\"%s\", %s, %s, this->attr_map_,\n" - " egr::Controller::Instance().GetExpectedPlace(),\n" - " &this->default_attr_map_, false, {});\n"; - std::string trace_opbase_str = paddle::string::Sprintf( - TRACE_OP_TEMPLATE, op_base_type, ins_name, outs_name); - - generated_grad_function_body += trace_opbase_str; - - VLOG(6) << "Generated Attrs Map"; - - // [Generation] Get Return - std::string outputs_str = ""; - size_t num_appended_outputs = 0; - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - - if (fwd_inputs_name_pos_map.count(fwd_name)) { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); - num_appended_outputs++; - } else { - PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); - } - } - - /* Handle Special Case: "PullSparseOp", etc - For returns, append "GradOut" to the very end of return list. 
*/ - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - - if (fwd_outputs_name_pos_map.count(fwd_name)) { - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; - outputs_str += - paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, num_appended_outputs, - outs_name, grad_out_name); - num_appended_outputs++; - } - } + generated_grad_function_body += GenerateSingleOpBase( + fwd_op_type, op_base_type, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, + grad_attrs, is_op_base_per_duplicable_input, &outs_size); + } - generated_grad_function_body += outputs_str; - generated_grad_function_body += "\n"; + if (is_op_base_per_duplicable_input) { + const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = + " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " %s\n" + " }\n"; + generated_grad_function_body = paddle::string::Sprintf( + OP_BASE_PER_DUP_INPUT_TEMPLATE, generated_grad_function_body); } const char* BWD_RETURN_TEMPLATE = @@ -2045,47 +2213,6 @@ static void DygraphCodeGeneration(const std::string& output_dir) { GenerateNodeCCFile(node_cc_path, grad_node_cc_str); } -static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" - std::vector functor_list = {"a", "b"}; - operators_with_attrs["fused_elemwise_add_activation"] = {}; - operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = - functor_list; - - // Handle "fused_elemwise_activation" - operators_with_attrs["fused_elemwise_activation"] = {}; - operators_with_attrs["fused_elemwise_activation"]["functor_list"] = - functor_list; - - // Handle "reverse" - std::vector axis = {0}; - operators_with_attrs["reverse"] = {}; - operators_with_attrs["reverse"]["axis"] = axis; - - // Handle "flip" - operators_with_attrs["flip"] = {}; - operators_with_attrs["flip"]["axis"] = axis; - - // Handle "cast" - operators_with_attrs["cast"] = {}; - operators_with_attrs["cast"]["out_dtype"] = 5; - operators_with_attrs["cast"]["in_dtype"] = 5; - - // Handle "transfer_dtype" - operators_with_attrs["transfer_dtype"] = {}; - operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; - operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; - - // Handle "c_split" - operators_with_attrs["c_split"] = {}; - operators_with_attrs["c_split"]["nranks"] = 1; -} - } // namespace framework } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 262d07336de08..07a8ae0ba0f9f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -321,7 +321,7 @@ def test_paddle_imperative_is_grad_enabled(self): with paddle.set_grad_enabled(True): self.assertTrue(paddle.is_grad_enabled()) - def test_sum_op(self): + def func_sum_op(self): x = np.ones([2, 2], np.float32) with fluid.dygraph.guard(): inputs = [] @@ -338,7 +338,7 @@ def test_sum_op(self): tmp = paddle.to_tensor(x) tmp.stop_gradient = False inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) + ret2 = paddle.add_n(inputs2) loss2 = 
fluid.layers.reduce_sum(ret2) fluid.set_flags({'FLAGS_sort_sum_gradient': True}) loss2.backward() @@ -349,6 +349,11 @@ def test_sum_op(self): a = inputs2[0].gradient() self.assertTrue(np.allclose(inputs2[0].gradient(), x)) + def test_sum_op(self): + with _test_eager_guard(): + self.func_sum_op() + self.func_sum_op() + def func_empty_var(self): with fluid.dygraph.guard(): cur_program = fluid.Program() From 4514f16d8a08a6c5014143528cfec87b14b6b1f0 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 6 Jan 2022 16:47:21 +0800 Subject: [PATCH 026/151] fix slot, test=develop (#38738) --- paddle/fluid/pybind/eager.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index be7c60334a68a..9484d506b20fb 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -741,6 +741,10 @@ extern struct PyGetSetDef variable_properties[]; extern PyMethodDef variable_methods[]; +PyNumberMethods number_methods; +PySequenceMethods sequence_methods; +PyMappingMethods mapping_methods; + PyTypeObject eager_tensor_type = { PyVarObject_HEAD_INIT(NULL, 0) "core_avx.eager.EagerTensor", /* tp_name */ sizeof(EagerTensorObject), /* tp_basicsize */ @@ -751,9 +755,9 @@ PyTypeObject eager_tensor_type = { 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ + &number_methods, /* tp_as_number */ + &sequence_methods, /* tp_as_sequence */ + &mapping_methods, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ From c48bd3ffe7e60aee306886a5a6898d1919e6c3ce Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 6 Jan 2022 18:46:01 +0800 Subject: [PATCH 027/151] [pten]move reduce files and dev_api (#38715) * move eigen/reduce.h imple into cpu/reduce.h * ctx to dev_ctx --- paddle/pten/include/math.h | 37 --- paddle/pten/kernels/cpu/math_kernel.cc | 5 +- paddle/pten/kernels/cpu/reduce.h | 180 ++++++++++++++- paddle/pten/kernels/funcs/reduce_functor.h | 37 +++ paddle/pten/kernels/hybird/eigen/reduce.h | 214 ------------------ paddle/pten/kernels/math_kernel.h | 31 +++ .../pten/tests/kernels/test_mean_dev_api.cc | 2 +- paddle/pten/tests/kernels/test_sum_dev_api.cc | 2 +- 8 files changed, 250 insertions(+), 258 deletions(-) create mode 100644 paddle/pten/kernels/funcs/reduce_functor.h delete mode 100644 paddle/pten/kernels/hybird/eigen/reduce.h diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index e46f460260adb..faa4c8db8dac3 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/include/infermeta.h" #include "paddle/pten/kernels/complex_kernel.h" -#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/kernels/scale_kernel.h" namespace pten { @@ -34,42 +33,6 @@ DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) { return dense_out; } -template -DenseTensor Mean(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - bool reduce_all = false; - MeanKernel(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - out_meta); - - // The real value of reduce_all will be get in kernel - // so use default value(false) is OK. - bool reduce_all = false; - - SumKernel( - dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); - return dense_out; -} - template DenseTensor Scale(const ContextT& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 2a696584bc781..be0d52355bce6 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -21,6 +21,7 @@ #include "paddle/pten/kernels/cpu/elementwise.h" #include "paddle/pten/kernels/cpu/reduce.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/reduce_functor.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/eigen.h" @@ -61,7 +62,7 @@ void MeanKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { auto out_dtype = x.dtype(); - pten::Reduce( + pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } @@ -97,7 +98,7 @@ void SumKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { - pten::Reduce( + pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index fc5dbe9d58d63..fa603b2163055 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -19,10 +19,184 @@ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/cast_kernel.h" -#include "paddle/pten/kernels/hybird/eigen/reduce.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/hybird/transpose.h" +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace pten { +template +void ReduceFunctor(const DeviceContext& context, + const pten::DenseTensor& input, + pten::DenseTensor* output, + const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = paddle::framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = paddle::framework::make_ddim(dims_vector); + } + auto& place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +#define HANDLE_REDUCE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + dev_ctx, input, output, dims, keep_dim); \ + } +//////////////// HandleLargeDim + +inline void GetShuffledDim(const DDim& src_dims, + DDim* dst_dims, + const std::vector& reduced_dims, + std::vector* perm_axis) { + // check if it's a reduced dim + std::vector src_dims_check(src_dims.size(), false); + size_t src_size = src_dims.size(); + size_t reduce_size = reduced_dims.size(); + std::vector regular_reduced_dims = reduced_dims; + for (size_t i = 0; i < regular_reduced_dims.size(); i++) { + if (regular_reduced_dims[i] < 0) { + regular_reduced_dims[i] = src_size + regular_reduced_dims[i]; + } + } + + for (size_t i = 0; i < reduce_size; ++i) { + dst_dims->at(src_size - reduce_size + i) = + src_dims[regular_reduced_dims[i]]; + (*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i]; + src_dims_check[regular_reduced_dims[i]] = true; + } + + size_t offset = 0; + for (size_t i = 0; i < src_dims_check.size(); ++i) { + bool is_reduced = src_dims_check[i]; + if (!is_reduced) { + (*perm_axis)[offset] = i; + dst_dims->at(offset++) = src_dims[i]; + } + } +} + +template +void GetShuffledInput(const DeviceContext& dev_ctx, + const pten::DenseTensor& input, + pten::DenseTensor* shuffled_input, + const std::vector& dims) { + DDim shuffled_dims(input.dims()); + std::vector perm_axis(input.dims().size()); + GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); + + shuffled_input->Resize(shuffled_dims); + shuffled_input->mutable_data(); + + pten::math::TransposeNormal trans; + trans(dev_ctx, input, shuffled_input, perm_axis); +} + +template +void HandleLargeDim(const DeviceContext& dev_ctx, + const pten::DenseTensor& input, + pten::DenseTensor* output, + const std::vector& dims, + bool keep_dim) { + // shuffle the reduced dim to the end + pten::DenseTensor shuffled_input = pten::DenseTensor( + pten::make_intrusive(input.place()), + input.meta()); + + GetShuffledInput(dev_ctx, input, &shuffled_input, dims); + + // transpose to 2D tensor whose shape is {unreduced, reduced}. 
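+  // e.g. an input of dims [2, 3, 4, 5] reduced over dims {1, 3} is shuffled
+  // to [2, 4, 3, 5] and viewed as a 2-D tensor of shape {8, 15}, so only the
+  // trailing (reduced) dimension needs to be collapsed.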
+ const int64_t unreduced = output->numel(); + const int64_t reduced = shuffled_input.numel() / unreduced; + shuffled_input.Resize({unreduced, reduced}); + DDim output_dim = output->dims(); + output->Resize({unreduced}); + ReduceFunctor( + dev_ctx, shuffled_input, output, {1}, keep_dim); + output->Resize(output_dim); +} + +////////////// ReduceKernel + +template +void ReduceKernelImpl(const DeviceContext& dev_ctx, + const pten::DenseTensor& input, + pten::DenseTensor* output, + const std::vector& dims, + bool keep_dim, + bool reduce_all) { + output->mutable_data(); + + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto x = EigenVector::Flatten(input); + auto out = EigenScalar::From(*output); + auto& dev = *dev_ctx.eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + + Functor functor; + functor(dev, &x, &out, reduce_dim); + } else { + int ndim = input.dims().size(); + int rdim = dims.size(); + if (ndim > 6) { + HandleLargeDim( + dev_ctx, input, output, dims, keep_dim); + + } else { + HANDLE_REDUCE_DIM(6, 5); + HANDLE_REDUCE_DIM(6, 4); + HANDLE_REDUCE_DIM(6, 3); + HANDLE_REDUCE_DIM(6, 2); + HANDLE_REDUCE_DIM(6, 1); + HANDLE_REDUCE_DIM(5, 4); + HANDLE_REDUCE_DIM(5, 3); + HANDLE_REDUCE_DIM(5, 2); + HANDLE_REDUCE_DIM(5, 1); + HANDLE_REDUCE_DIM(4, 3); + HANDLE_REDUCE_DIM(4, 2); + HANDLE_REDUCE_DIM(4, 1); + HANDLE_REDUCE_DIM(3, 2); + HANDLE_REDUCE_DIM(3, 1); + HANDLE_REDUCE_DIM(2, 1); + HANDLE_REDUCE_DIM(1, 1); + } + } +} + template void Reduce(const DeviceContext& dev_ctx, const DenseTensor& x, @@ -52,7 +226,7 @@ void Reduce(const DeviceContext& dev_ctx, // do reduce sum PD_VISIT_ALL_TYPES( out_dtype, "ReduceKernelImpl", ([&] { - pten::eigen::ReduceKernelImpl( + pten::ReduceKernelImpl( dev_ctx, x, out, dims, keep_dim, reduce_all); })); } else { @@ -66,7 +240,7 @@ void Reduce(const DeviceContext& dev_ctx, // do reduce sum PD_VISIT_ALL_TYPES( out_dtype, "ReduceKernelImpl", ([&] { - pten::eigen::ReduceKernelImpl( + pten::ReduceKernelImpl( dev_ctx, tmp_tensor, out, dims, keep_dim, reduce_all); })); } diff --git a/paddle/pten/kernels/funcs/reduce_functor.h b/paddle/pten/kernels/funcs/reduce_functor.h new file mode 100644 index 0000000000000..64ada0231892e --- /dev/null +++ b/paddle/pten/kernels/funcs/reduce_functor.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
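+//
+// Eigen-based reduce functors: SumFunctor applies x->sum(dim) and MeanFunctor
+// applies x->mean(dim); the CPU reduce path in paddle/pten/kernels/cpu/reduce.h
+// uses them as the Functor template argument of ReduceKernelImpl.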
+ +#pragma once + +namespace pten { +namespace funcs { + +//////// Sum Functor /////// +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +//////// Mean Functor /////// +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/hybird/eigen/reduce.h b/paddle/pten/kernels/hybird/eigen/reduce.h deleted file mode 100644 index d60a416dfdb37..0000000000000 --- a/paddle/pten/kernels/hybird/eigen/reduce.h +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" -#include "paddle/pten/kernels/hybird/transpose.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { -namespace eigen { - -template -void ReduceFunctor(const DeviceContext& context, - const pten::DenseTensor& input, - pten::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - auto x = EigenTensor::From(input); - auto x_rank = static_cast(x.dimensions().size()); - auto reduce_dim = Eigen::array(); - std::vector dims_ref = dims; - for (size_t i = 0; i < dims_ref.size(); ++i) { - if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; - reduce_dim[i] = dims_ref[i]; - } - // construct the squeezed output tensor - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = paddle::framework::vectorize(out_dims); - for (size_t i = 0; i < dims_ref.size(); ++i) { - dims_vector[dims_ref[i]] = kDelFlag; - } - dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = paddle::framework::make_ddim(dims_vector); - } - auto& place = *context.eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } -} - -#define HANDLE_REDUCE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceFunctor( \ - dev_ctx, input, output, dims, keep_dim); \ - } -//////////////// HandleLargeDim - -inline void GetShuffledDim(const DDim& src_dims, - DDim* dst_dims, - const std::vector& reduced_dims, - std::vector* perm_axis) { - // check if it's a reduced dim - std::vector src_dims_check(src_dims.size(), false); - size_t src_size = src_dims.size(); - size_t reduce_size = reduced_dims.size(); - std::vector regular_reduced_dims = reduced_dims; - for (size_t i = 0; i < regular_reduced_dims.size(); i++) { - if (regular_reduced_dims[i] < 0) { - regular_reduced_dims[i] = 
src_size + regular_reduced_dims[i]; - } - } - - for (size_t i = 0; i < reduce_size; ++i) { - dst_dims->at(src_size - reduce_size + i) = - src_dims[regular_reduced_dims[i]]; - (*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i]; - src_dims_check[regular_reduced_dims[i]] = true; - } - - size_t offset = 0; - for (size_t i = 0; i < src_dims_check.size(); ++i) { - bool is_reduced = src_dims_check[i]; - if (!is_reduced) { - (*perm_axis)[offset] = i; - dst_dims->at(offset++) = src_dims[i]; - } - } -} - -template -void GetShuffledInput(const DeviceContext& dev_ctx, - const pten::DenseTensor& input, - pten::DenseTensor* shuffled_input, - const std::vector& dims) { - DDim shuffled_dims(input.dims()); - std::vector perm_axis(input.dims().size()); - GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); - - shuffled_input->Resize(shuffled_dims); - shuffled_input->mutable_data(); - - pten::math::TransposeNormal trans; - trans(dev_ctx, input, shuffled_input, perm_axis); -} - -template -void HandleLargeDim(const DeviceContext& dev_ctx, - const pten::DenseTensor& input, - pten::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - // shuffle the reduced dim to the end - pten::DenseTensor shuffled_input = pten::DenseTensor( - pten::make_intrusive(input.place()), - input.meta()); - - GetShuffledInput(dev_ctx, input, &shuffled_input, dims); - - // transpose to 2D tensor whose shape is {unreduced, reduced}. - const int64_t unreduced = output->numel(); - const int64_t reduced = shuffled_input.numel() / unreduced; - shuffled_input.Resize({unreduced, reduced}); - DDim output_dim = output->dims(); - output->Resize({unreduced}); - ReduceFunctor( - dev_ctx, shuffled_input, output, {1}, keep_dim); - output->Resize(output_dim); -} - -////////////// ReduceKernel - -template -void ReduceKernelImpl(const DeviceContext& dev_ctx, - const pten::DenseTensor& input, - pten::DenseTensor* output, - const std::vector& dims, - bool keep_dim, - bool reduce_all) { - output->mutable_data(); - - if (reduce_all) { - // Flatten and reduce 1-D tensor - auto x = EigenVector::Flatten(input); - auto out = EigenScalar::From(*output); - auto& dev = *dev_ctx.eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - - Functor functor; - functor(dev, &x, &out, reduce_dim); - } else { - int ndim = input.dims().size(); - int rdim = dims.size(); - if (ndim > 6) { - HandleLargeDim( - dev_ctx, input, output, dims, keep_dim); - - } else { - HANDLE_REDUCE_DIM(6, 5); - HANDLE_REDUCE_DIM(6, 4); - HANDLE_REDUCE_DIM(6, 3); - HANDLE_REDUCE_DIM(6, 2); - HANDLE_REDUCE_DIM(6, 1); - HANDLE_REDUCE_DIM(5, 4); - HANDLE_REDUCE_DIM(5, 3); - HANDLE_REDUCE_DIM(5, 2); - HANDLE_REDUCE_DIM(5, 1); - HANDLE_REDUCE_DIM(4, 3); - HANDLE_REDUCE_DIM(4, 2); - HANDLE_REDUCE_DIM(4, 1); - HANDLE_REDUCE_DIM(3, 2); - HANDLE_REDUCE_DIM(3, 1); - HANDLE_REDUCE_DIM(2, 1); - HANDLE_REDUCE_DIM(1, 1); - } - } -} - -//////// Sum Functor /////// -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -//////// Mean Functor /////// -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index b1e5188f3aaef..f87d0a31b470b 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -17,6 +17,7 @@ limitations 
under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { @@ -121,4 +122,34 @@ DenseTensor Multiply(const ContextT& dev_ctx, return dense_out; } +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + bool reduce_all = false; + MeanKernel(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + + // The real value of reduce_all will be get in kernel + // so use default value(false) is OK. + bool reduce_all = false; + + SumKernel( + dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); + return dense_out; +} + } // namespace pten diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 4d062977e23bd..4b254e7e6c1ac 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 381b8fe44f532..afaf903063781 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" From aec493c0d30dd739bc7e9d39269ebca0357a10ca Mon Sep 17 00:00:00 2001 From: Thomas Young <35565423+HexToString@users.noreply.github.com> Date: Thu, 6 Jan 2022 19:23:53 +0800 Subject: [PATCH 028/151] fix expand_v2 and expand_as_v2 bug (#38677) --- paddle/fluid/operators/expand_as_v2_op.cc | 11 ++++++++ paddle/fluid/operators/expand_as_v2_op.h | 33 +++++++++++++++++------ paddle/fluid/operators/expand_v2_op.cc | 6 ++++- python/paddle/tensor/manipulation.py | 2 +- 4 files changed, 42 insertions(+), 10 deletions(-) mode change 100644 => 100755 paddle/fluid/operators/expand_as_v2_op.cc mode change 100644 => 100755 paddle/fluid/operators/expand_as_v2_op.h mode change 100644 => 100755 paddle/fluid/operators/expand_v2_op.cc mode change 100644 => 100755 python/paddle/tensor/manipulation.py diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc old mode 100644 new mode 100755 index 5296a144f6247..cc293a5aaa0b2 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -50,6 +51,10 @@ class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "X is the input to be expanded."); + AddInput("Y", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "Expand X according to the shape of Y.") + .AsDispensable(); AddOutput("Out", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "The rank of Output(Out) have the same with Input(X). " @@ -144,3 +149,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel); #endif + +REGISTER_OP_VERSION(expand_as_v2) + .AddCheckpoint( + R"ROC(fix expand_as_v2 and add new input [Y])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Y", "Expand X according to the shape of Y")); \ No newline at end of file diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h old mode 100644 new mode 100755 index 3e8f7d15880bc..9e683a792c61f --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -91,17 +91,34 @@ class ExpandAsV2Kernel : public framework::OpKernel { PADDLE_ENFORCE_NE(target_shape[i], 0, platform::errors::InvalidArgument( "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { + if (i < diff) { + PADDLE_ENFORCE_GT( + target_shape[i], 0, + platform::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand_as_v2 op.", + target_shape[i])); + repeat_times[i] = target_shape[i]; + } else if (target_shape[i] > 0) { + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], target_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in shape for expand_as_v2 op.", + vec_in_dims[i], target_shape[i])); + repeat_times[i] = 1; + } else { + repeat_times[i] = target_shape[i]; + } + } else { PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], + target_shape[i], -1, platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); + "When the value in shape is negative for expand_as_v2 op, " + "only -1 is supported, but the value received is %d.", + target_shape[i])); repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; } } auto* out0 = context.Output("Out"); diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc old mode 100644 new mode 100755 index dc6da979671e5..6d803c500d90f --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -65,7 +65,11 @@ class ExpandV2Op : public framework::OperatorWithKernel { if (x_dims[i] == -1) { out_shape[i] = -1; } else if (expand_shape[i] == -1) { - out_shape[i] = x_dims[i]; + if (static_cast(x_dims.size()) > i) { + out_shape[i] = x_dims[i]; + } else { + out_shape[i] = -1; + } } else if (expand_shape[i] == -2) { // We use -2 to represent the element in expand_shape is a var. 
out_shape[i] = -1; diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py old mode 100644 new mode 100755 index b54c3596a26a9..a15c1af391f9f --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1838,7 +1838,7 @@ def expand_as(x, y, name=None): "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " "some_var as the input 'x'.") - inputs = {"X": [x]} + inputs = {"X": [x], "Y": [y]} helper = LayerHelper('expand_as', **locals()) dtype = helper.input_dtype(input_param_name='x') From 747000dda98f71a1d82c88dbf84ba46843011e9b Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Thu, 6 Jan 2022 20:17:05 +0800 Subject: [PATCH 029/151] [Auto Parallel] Pass bugfix (#38741) --- .../distributed/auto_parallel/parallelizer.py | 53 ++++++--------- .../passes/auto_parallel_sharding.py | 66 ++++++++++--------- .../auto_parallel_pass_test_base.py | 4 +- .../test_auto_parallel_sharding_pass.py | 2 +- .../test_auto_parallel_cost_model.py | 3 - .../unittests/test_auto_parallel_mapper.py | 3 +- .../test_auto_parallel_partitioner_gpt.py | 4 -- .../unittests/test_auto_parallel_reshard.py | 3 - .../test_auto_parallel_reshard_dpmppp.py | 3 - .../test_auto_parallel_reshard_mppp.py | 2 - 10 files changed, 60 insertions(+), 83 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 9ff673b1d2901..7cad4d746bbf2 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -23,6 +23,7 @@ import pickle import time import paddle +from paddle.fluid.backward import append_backward from paddle.distributed.utils import get_logger from paddle.distributed.fleet import cloud_utils import paddle.fluid.core as core @@ -96,49 +97,35 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) - def _apply_serial_forward_pass(self, main_program, startup_program): + def _apply_serial_pass(self, main_program, startup_program): - # apply amp forward pass + # apply amp pass if self._dist_strategy.amp: auto_parallel_amp_pass = new_pass("auto_parallel_amp_pass", self._dist_strategy.amp_configs) - auto_parallel_amp_pass.apply_forward(main_program, startup_program, - self._pass_context) + auto_parallel_amp_pass.apply(main_program, startup_program, + self._pass_context) - # apply recompute forward pass + # apply recompute pass if self._dist_strategy.recompute: auto_parallel_recompute_pass = new_pass( "auto_parallel_recompute_pass", self._dist_strategy.recompute_configs) - auto_parallel_recompute_pass.apply_forward( - main_program, startup_program, self._pass_context) + auto_parallel_recompute_pass.apply(main_program, startup_program, + self._pass_context) def _generate_backward(self, main_program, startup_program, loss, parameter_list, no_grad_set, callbacks): - # apply recompute backward pass - if self._dist_strategy.recompute: - assert auto_parallel_recompute_pass - auto_parallel_recompute_pass.apply_forward( - main_program, startup_program, parameter_list, no_grad_set, - self._pass_context) - else: - from paddle.fluid.backward import append_backward - with program_guard(main_program, startup_program): - params_grads = append_backward( - loss, - parameter_list, - no_grad_set, - callbacks, - distop_context=self._dist_context.dist_op_context) - complete_backward_annotation( - main_program, dist_context=self._dist_context) - - # apply amp 
forward pass - if self._dist_strategy.amp: - assert auto_parallel_amp_pass - auto_parallel_amp_pass.apply_backward(main_program, startup_program, - self._pass_context) + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, + parameter_list, + no_grad_set, + callbacks, + distop_context=self._dist_context.dist_op_context) + complete_backward_annotation( + main_program, dist_context=self._dist_context) return params_grads @@ -192,14 +179,14 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): completed_main_program = serial_main_program self._dist_context = copy.deepcopy(dist_context) - # serial forward pass - self._apply_serial_forward_pass(completed_main_program, - serial_startup_program) # serial backward pass params_grads = self._generate_backward( completed_main_program, serial_startup_program, serial_loss, self._parameter_list, self._no_grad_set, self._callbacks) + # serial forward pass + self._apply_serial_pass(completed_main_program, serial_startup_program) + # Logical partition rank = paddle.distributed.get_rank() partitioner = Partitioner(self._dist_context, rank) diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 5e799c52092db..2785eae6e8a46 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -94,7 +94,7 @@ def _build_sharding_groups(self, main_block, params_grads): def _collective_data_parallel_groups(self, main_block): for op in main_block.ops: - if op.type in _skip_ops: + if not _is_forward_op(op) or op.type in _skip_ops: continue group = _inference_data_parallel_group_for_operator( self.global_rank, op, self._dist_context) @@ -106,7 +106,7 @@ def _collective_data_parallel_groups(self, main_block): if len(self.dp_groups) != 1: raise NotImplementedError( "So far Only and Exactly one data parallel group in network are supported, but got [{}] different data parallel groups". 
- format(len(groups))) + format(len(self.dp_groups))) def _build_sharding_infos(self, params_grads): @@ -193,18 +193,32 @@ def _shard_gradient_clip(self, main_block): return # TODO (JZ-LIANG) support calculate global norm with tensor parallelism - is_clip_grad_by_global_norm = False + removed_op_type = ['elementwise_mul', 'squared_l2_norm', 'clip_by_norm'] + removed_op_idx = set() + removed_tmp_var = set() + for idx, op in list(enumerate(main_block.ops)): if not _is_gradient_clip_op(op): continue - if op.type == 'sum': - is_clip_grad_by_global_norm = True - break - if not is_clip_grad_by_global_norm: - return - removed_op_idx = set() - removed_tmp_var = set() + if op.type in removed_op_type: + input_name = op.input("X")[0] + param_name = input_name[:input_name.find("@GRAD")] + if not self._is_parameter_in_local_shard(param_name): + removed_op_idx.add(idx) + if op.type in ['squared_l2_norm', 'clip_by_norm']: + for output_name in op.output_arg_names: + removed_tmp_var.add(output_name) + + for idx, op in reversed(list(enumerate(main_block.ops))): + if not _is_gradient_clip_op(op): + continue + if idx in removed_op_idx: + main_block._remove_op(idx, sync=False) + + for varname in removed_tmp_var: + main_block._remove_var(varname, sync=False) + for idx, op in list(enumerate(main_block.ops)): if not _is_gradient_clip_op(op): continue @@ -218,7 +232,7 @@ def _shard_gradient_clip(self, main_block): sum_op_output = op.desc.output_arg_names()[0] for i, sharding_info in enumerate(self.sharding_infos): new_op = main_block._insert_op( - idx + i, + idx + i + 1, type='c_allreduce_sum', inputs={'X': [sum_op_output]}, outputs={'Out': [sum_op_output]}, @@ -235,21 +249,6 @@ def _shard_gradient_clip(self, main_block): new_op, dist_attr.process_mesh, dist_attr.dims_mapping, self._dist_context) break - for input_name in op.input_arg_names: - param_name = input_name[:input_name.find("@GRAD")] - if not self._is_parameter_in_local_shard(param_name): - removed_op_idx.add(idx) - for output_name in op.output_arg_names: - removed_tmp_var.add(output_name) - - for idx, op in reversed(list(enumerate(main_block.ops))): - if not _is_gradient_clip_op(op): - continue - if idx in removed_op_idx: - main_block._remove_op(idx, sync=False) - - for varname in removed_tmp_var: - main_block._remove_var(varname, sync=False) main_block._sync_with_cpp() @@ -424,12 +423,15 @@ def _shard_parameter(self, main_block, startup_block): startup_block._remove_op(idx, sync=False) continue - if op.type != "c_broadcast" and output_name in not_used_param_nane: + if op.type != "c_broadcast" and output_name in param_usage and sharding_info.get_var_rank( + output_name) != sharding_info.local_rank: startup_block._remove_op(idx, sync=False) - for varname in not_used_param_nane: - main_block._remove_var(varname, sync=False) - startup_block._remove_var(varname, sync=False) + for param_name in param_usage: + if sharding_info.get_var_rank( + param_name) != sharding_info.local_rank: + main_block._remove_var(param_name, sync=False) + startup_block._remove_var(param_name, sync=False) main_block._sync_with_cpp() startup_block._sync_with_cpp() @@ -594,6 +596,10 @@ def _is_param_grad_allreduce_op(op, block, dp_ring_ids): return block.var(base_name).is_parameter +def _is_forward_op(op): + return op.attr("op_role") == 0 + + def _inference_data_parallel_group_for_operator(rank_id, op, dist_context): dp_group = None diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py 
b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py index f5eda2fdbf8e2..42bdf67824220 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py @@ -178,13 +178,13 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len, preds = model(tokens, position_ids, attention_mask) criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) optimizer = paddle.fluid.optimizer.AdamOptimizer( learning_rate=0.00001, beta1=0.9, beta2=0.999, epsilon=1e-08, - grad_clip=None) + grad_clip=clip) optimizer = fleet.distributed_optimizer(optimizer) startup_program = paddle.static.default_startup_program() _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py index f6b42701c2195..51e87260609df 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py @@ -46,7 +46,7 @@ def apply_passes(self): dist_strategy.sharding = True dist_strategy.sharding_configs = { "sharding_degree": 2, - "stage": 3, + "stage": 2, } fleet.init(is_collective=True, strategy=dist_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index ab91c3fe7c4c2..83254de61298b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -157,9 +157,6 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = auto.complete_annotation(train_program, dist_context) - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) - params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 9fe5a52cf08af..3a28595c833e0 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -478,8 +478,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): # auto completion complete_train_program = auto.complete_annotation(train_program, dist_context) - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) + params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 3270cfc3c8a54..dc2ad1d900f52 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -884,10 +884,6 @@ def test_gpt_dp_mp(self): complete_train_program = auto.complete_annotation(train_program, dist_context) - # serial forward pass - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) - # 
serial backward pass params_grads = parallelizer._generate_backward( complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 0631cc74a32bd..614b996d26521 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -155,9 +155,6 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = auto.complete_annotation(train_program, dist_context) - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) - params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 0e098664f7ebb..cfbb7653fad8e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -119,9 +119,6 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = auto.complete_annotation(train_program, dist_context) - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) - params_grads = parallelizer._generate_backward( complete_train_program, startup_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index c6b1be652073c..272c1c212f08e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -134,8 +134,6 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): # serial forward & backward completion complete_train_program = auto.complete_annotation(train_program, dist_context) - parallelizer._apply_serial_forward_pass(complete_train_program, - startup_program) params_grads = parallelizer._generate_backward( complete_train_program, From ee813e349d017b3b1abb775ebf81a5282dd8f628 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 6 Jan 2022 13:36:41 +0100 Subject: [PATCH 030/151] Reupload: Added numpy bf16 datatype support via custom pip package (#38703) * reuploaded files * Changed year from 2021 to 2022 * minor change * fixed requirements.txt file --- .../test_python_bf16_numpy_datatype.py | 34 +++++++++++++++++++ python/requirements.txt | 1 + 2 files changed, 35 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py diff --git a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py new file mode 100644 index 0000000000000..a58d7d35807c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
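The paddle_bfloat package added to requirements.txt registers a bfloat16 scalar type with NumPy; a minimal sketch of how it is used (illustrative only, mirroring the test added below):

    import numpy as np
    from paddle_bfloat import bfloat16

    a = np.arange(6, dtype=np.float32).reshape(2, 3).astype(bfloat16)
    print(a.dtype)               # bfloat16
    print(a.astype(np.float32))  # round-trips, up to bf16's reduced mantissa precision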
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle_bfloat import bfloat16 +import unittest + + +class TestBF16DataType(unittest.TestCase): + def test_matmul(self): + a_bf16 = np.random.random((6, 7)).astype(bfloat16) + b_bf16 = np.random.random((7, 8)).astype(bfloat16) + c_bf16 = np.matmul(a_bf16, b_bf16) + + a_fp32 = a_bf16.astype(np.float32) + b_fp32 = b_bf16.astype(np.float32) + c_fp32 = np.matmul(a_fp32, b_fp32) + + self.assertTrue(np.allclose(c_bf16, c_fp32)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/requirements.txt b/python/requirements.txt index f2a4580a94e51..5f2b788a81a0a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,3 +5,4 @@ Pillow six decorator astor +paddle_bfloat==0.1.2 From 3d3bc6816a2069191d701fba64cbb859a32c4655 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 6 Jan 2022 20:45:14 +0800 Subject: [PATCH 031/151] [PTen]Move manipulation mid to new directory and rename flatten/reshape kernel (#38730) * move mid api and rename kernel * use empty kernel --- paddle/fluid/operators/cast_op.h | 2 +- paddle/fluid/operators/flatten_op.h | 6 +-- paddle/fluid/operators/reshape_op.cc | 8 +-- paddle/pten/all.h | 1 - paddle/pten/include/manipulation.h | 53 ------------------- paddle/pten/kernels/flatten_kernel.cc | 18 +++---- paddle/pten/kernels/flatten_kernel.h | 23 ++++++-- paddle/pten/kernels/reshape_kernel.cc | 31 +++++++---- paddle/pten/kernels/reshape_kernel.h | 20 +++++-- .../pten/tests/kernels/test_cast_dev_api.cc | 2 +- .../tests/kernels/test_flatten_dev_api.cc | 2 +- .../tests/kernels/test_reshape_dev_api.cc | 2 +- 12 files changed, 74 insertions(+), 94 deletions(-) delete mode 100644 paddle/pten/include/manipulation.h diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 4f7fe2854ae87..72aa9a195ec7c 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/cast_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 29eb579b2a0d3..fa116d9516ecd 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/flatten_kernel.h" namespace paddle { namespace operators { @@ -134,8 +134,8 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Flatten(dev_ctx, *pt_x.get(), start_axis, stop_axis, - pt_out.get()); + pten::FlattenKernel(dev_ctx, *pt_x.get(), start_axis, + stop_axis, pt_out.get()); } }; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 996a784affa4c..f2162f55636e5 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { namespace framework { class InferShapeContext; @@ -438,18 +438,18 @@ class ReshapeKernel { } if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #endif // non-inplace need move all result from pt_out to out, inplace need set diff --git a/paddle/pten/all.h b/paddle/pten/all.h index 844114c341d67..7dd517e5e6381 100644 --- a/paddle/pten/all.h +++ b/paddle/pten/all.h @@ -18,5 +18,4 @@ limitations under the License. */ #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" #include "paddle/pten/include/linalg.h" -#include "paddle/pten/include/manipulation.h" #include "paddle/pten/include/math.h" diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h deleted file mode 100644 index a8625e52f5618..0000000000000 --- a/paddle/pten/include/manipulation.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/cast_kernel.h" -#include "paddle/pten/kernels/flatten_kernel.h" -#include "paddle/pten/kernels/reshape_kernel.h" - -namespace pten { - -template -DenseTensor Flatten(const ContextT& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis) { - auto out_meta = FlattenInferMeta(x.meta(), start_axis, stop_axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Flatten(dev_ctx, x, start_axis, stop_axis, &dense_out); - return dense_out; -} - -template -DenseTensor Reshape(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& shape) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Reshape(dev_ctx, x, ScalarArray(shape), &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index df8238cbf3a91..37d4d88ccb40e 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -22,11 +22,11 @@ namespace pten { template -void Flatten(const Context& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out) { +void FlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out) { auto out_dims = out->dims(); pten::Copy(dev_ctx, x, false, out); out->Resize(out_dims); @@ -42,7 +42,7 @@ void FlattenWithXShape(const Context& dev_ctx, int stop_axis, DenseTensor* out, DenseTensor* xshape) { - Flatten(dev_ctx, x, start_axis, stop_axis, out); + FlattenKernel(dev_ctx, x, start_axis, stop_axis, out); funcs::SetXShape(x, xshape); } @@ -51,7 +51,7 @@ void FlattenWithXShape(const Context& dev_ctx, PT_REGISTER_CTX_KERNEL(flatten, CPU, ALL_LAYOUT, - pten::Flatten, + pten::FlattenKernel, float, double, uint8_t, @@ -74,7 +74,7 @@ PT_REGISTER_CTX_KERNEL(flatten_with_xshape, PT_REGISTER_CTX_KERNEL(flatten, GPU, ALL_LAYOUT, - pten::Flatten, + pten::FlattenKernel, float, paddle::platform::float16, double, @@ -100,7 +100,7 @@ PT_REGISTER_CTX_KERNEL(flatten_with_xshape, PT_REGISTER_CTX_KERNEL(flatten, XPU, ALL_LAYOUT, - pten::Flatten, + pten::FlattenKernel, float, paddle::platform::float16, double, diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h index 5a0445489bcf3..a67e66fac4130 100644 --- a/paddle/pten/kernels/flatten_kernel.h +++ b/paddle/pten/kernels/flatten_kernel.h @@ -15,15 +15,17 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Flatten(const Context& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out); +void FlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out); template void FlattenWithXShape(const Context& dev_ctx, @@ -33,4 +35,15 @@ void FlattenWithXShape(const Context& dev_ctx, DenseTensor* out, DenseTensor* xshape); +template +DenseTensor Flatten(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis) { + auto out_meta = FlattenInferMeta(x.meta(), start_axis, stop_axis); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FlattenKernel(dev_ctx, x, start_axis, stop_axis, &dense_out); + return dense_out; +} + } // namespace pten diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 0535ea20c8cb0..d7e2e2707ee1b 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -22,10 +22,10 @@ namespace pten { template -void Reshape(const Context& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out) { +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shape, + DenseTensor* out) { auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); if (x.data() == out->data() && x.numel() == out->numel()) { out->Resize(out_meta.dims); @@ -43,13 +43,16 @@ void ReshapeWithXShape(const Context& dev_ctx, DenseTensor* xshape, DenseTensor* out) { funcs::SetXShape(x, xshape); - Reshape(dev_ctx, x, shape, out); + ReshapeKernel(dev_ctx, x, shape, out); } } // namespace pten -PT_REGISTER_GENERAL_KERNEL( - reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape, + CPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, CPU, ALL_LAYOUT, @@ -57,8 +60,11 @@ PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, ALL_DTYPE) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_GENERAL_KERNEL( - reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape, + GPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, GPU, ALL_LAYOUT, @@ -67,8 +73,11 @@ PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_GENERAL_KERNEL( - reshape, XPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape, + XPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, XPU, ALL_LAYOUT, diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h index b10e31a434c00..faa51c69ad17c 100644 --- a/paddle/pten/kernels/reshape_kernel.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -16,14 +16,16 @@ limitations under the License. 
*/ #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Reshape(const Context& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out); +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shape, + DenseTensor* out); template void ReshapeWithXShape(const Context& dev_ctx, @@ -32,4 +34,14 @@ void ReshapeWithXShape(const Context& dev_ctx, DenseTensor* xshape, DenseTensor* out); +template +DenseTensor Reshape(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& shape) { + auto out_meta = InferMetaFromVecValue(x.meta(), shape); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + ReshapeKernel(dev_ctx, x, ScalarArray(shape), &dense_out); + return dense_out; +} + } // namespace pten diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index dc3cff150b47b..cb45d827e3be9 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/cast_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/common/data_type.h" diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index d2ff7480e904f..f18e5c050ba70 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/flatten_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 64efdc6f67201..0196e1c211004 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/reshape_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" From 42cfd15e672e1ed7ad0242c1ae9e492f197599d6 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 6 Jan 2022 21:03:04 +0800 Subject: [PATCH 032/151] [pten] fix typo of device (#38760) --- paddle/pten/common/device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pten/common/device.cc b/paddle/pten/common/device.cc index 9583b521d9123..55130067ae200 100644 --- a/paddle/pten/common/device.cc +++ b/paddle/pten/common/device.cc @@ -24,7 +24,7 @@ const char* DeviceTypeStr(DeviceType type) { case DeviceType::kUndef: return "kUndef"; case DeviceType::kHost: - return "kUndef"; + return "kHost"; case DeviceType::kXpu: return "kXpu"; case DeviceType::kCuda: From 1b6e4664e7a9d03b3fb21c9a2e70eae243be7130 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Fri, 7 Jan 2022 10:21:26 +0800 Subject: [PATCH 033/151] Add fp16 support for scale and bias parameter for fused_layernnorm_residual_dropout op. (#38775) * Add fp16 support for scale/bias for fused_layernnorm_residual_dropout_bias op. 
--- .../operators/fused/fused_dropout_helper.h | 37 +++---- .../fused_layernorm_residual_dropout_bias.h | 96 +++++++++++-------- ...ed_layernorm_residual_dropout_bias_test.cu | 2 +- 3 files changed, 80 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 970b2d82e2b15..3972c60e8347b 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -250,11 +250,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } // out = layernorm(residual + dropout(src + bias)) - void LayernormResidualDropoutBias( - const platform::CUDADeviceContext& ctx, const T* src, const T* residual, - const T* bias, const LayerNormParamType* gamma, - const LayerNormParamType* beta, T* dropout_out, MaskType* mask, T* out, - LayerNormParamType* mean, LayerNormParamType* variance) { + template , bool is_same_type = false> + void LayernormResidualDropoutBias(const platform::CUDADeviceContext& ctx, + const T* src, const T* residual, + const T* bias, const P* gamma, + const P* beta, T* dropout_out, + MaskType* mask, T* out, + LayerNormParamType* mean, + LayerNormParamType* variance) { using U = LayerNormParamType; int vec_size = MAX_CACHE_BYTES / sizeof(T); if (this->cols_ % vec_size != 0) { @@ -263,7 +266,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { int threads = GetDesiredBlockDim(this->cols_ / vec_size); int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); - LaunchLayernormResidualDropoutBias( + LaunchLayernormResidualDropoutBias( this->rows_, this->cols_, increment, this->dropout_param_.seed, this->dropout_param_.dropout_prob, epsilon_, this->dropout_param_.is_upscale_in_train, this->dropout_param_.is_test, @@ -271,17 +274,19 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { variance, ctx); } - void LayernormResidualDropoutBiasGrad( - const platform::CUDADeviceContext& ctx, const T* d_out, - const T* layernorm_src, const MaskType* mask, - const LayerNormParamType* gamma, const LayerNormParamType* mean, - const LayerNormParamType* variance, T* d_layernorm_src, - LayerNormParamType* d_scale, LayerNormParamType* d_layernorm_bias, - T* d_dropout_src, T* d_bias, T* d_residual) { + template , bool is_same_type = false> + void LayernormResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + const T* d_out, const T* layernorm_src, + const MaskType* mask, const P* gamma, + const LayerNormParamType* mean, + const LayerNormParamType* variance, + T* d_layernorm_src, P* d_scale, + P* d_layernorm_bias, T* d_dropout_src, + T* d_bias, T* d_residual) { using U = LayerNormParamType; - LayerNormBackward(layernorm_src, d_out, gamma, mean, variance, - d_layernorm_src, d_scale, d_layernorm_bias, - epsilon_, this->rows_, this->cols_, ctx); + LayerNormBackward( + layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale, + d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx); this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, d_residual, d_bias); } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 1827e137c15f1..b27b70dc9dc0c 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ 
b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -24,46 +24,57 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +template +using LayerNormScaleBiasT = + typename std::conditional::type; + /** * @brief fused add_bias, dropout, add residual and leyer_norm into one * operators. Currently only support forward */ -template -__device__ void CalcLayernormY(const LayerNormParamType *scale, - const LayerNormParamType *bias, const T *x, - T *y, const int row_id, const int col_id, - const int cols, - const LayerNormParamType mean_val, - const LayerNormParamType invvar) { - using U = LayerNormParamType; +template +__device__ void CalcLayernormY( + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *bias, const T *x, + T *y, const int row_id, const int col_id, const int cols, + const LayerNormParamType mean_val, const LayerNormParamType invvar) { using LoadT = platform::AlignedVector; using StoreT = platform::AlignedVector; using LoadU = platform::AlignedVector; + using LoadScaleOrBias = + platform::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { - LoadU scale_vec; - LoadU bias_vec; + LoadScaleOrBias scale_vec; + LoadScaleOrBias bias_vec; LoadT x_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - scale_vec[ii] = static_cast(1); - bias_vec[ii] = static_cast(0); + scale_vec[ii] = + static_cast>(1); + bias_vec[ii] = + static_cast>(0); } // vectorize load data from global platform::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load(&scale[i], &scale_vec); + platform::Load, + VecSize>(&scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load(&bias[i], &bias_vec); + platform::Load, + VecSize>(&bias[i], &bias_vec); } StoreT y_vec; for (int ii = 0; ii < VecSize; ii++) { - y_vec[ii] = static_cast( - scale_vec[ii] * (static_cast(x_vec[ii]) - mean_val) * invvar + - bias_vec[ii]); + y_vec[ii] = + static_cast(static_cast(scale_vec[ii]) * + (static_cast(x_vec[ii]) - mean_val) * invvar + + static_cast(bias_vec[ii])); } platform::Store(y_vec, &y[row_id * cols + i]); } @@ -85,15 +96,17 @@ __device__ void CalcLayernormY(const LayerNormParamType *scale, * means: [rows]: layernorm means * vars: [rows]: layernorm vars */ -template +template __global__ void FusedLayernormResidualDropoutBias( const size_t rows, const size_t cols, uint64_t seed, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, const uint64_t increment, const float epsilon, const T *src, const T *residual, const T *bias, - const LayerNormParamType *scale, - const LayerNormParamType *layernorm_bias, MaskType *mask, T *dst, - T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var) { + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *layernorm_bias, + MaskType *mask, T *dst, T *layernorm_dst, LayerNormParamType *mean, + LayerNormParamType *var) { int col_id = threadIdx.x; int row_id = blockIdx.x; int idx = row_id * cols + col_id; @@ -101,7 +114,6 @@ __global__ void FusedLayernormResidualDropoutBias( curand_init(seed, idx, increment, &state); T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); - using U = LayerNormParamType; __shared__ U mean_share; __shared__ U var_share; @@ -121,10 +133,12 @@ __global__ void FusedLayernormResidualDropoutBias( mean_val = BlockReduceSum(mean_val, shared_mean); var_val = BlockReduceSum(var_val, shared_var); if (threadIdx.x == 0) { - auto scale = 
static_cast(1.) / static_cast(cols); - auto tmp = mean_val * scale; + auto scale = static_cast>( + static_cast(1.) / static_cast(cols)); + auto tmp = mean_val * static_cast(scale); mean[row_id] = mean_share = static_cast(tmp); - var_share = static_cast(var_val * scale - mean_share * mean_share); + var_share = static_cast(var_val * static_cast(scale) - + mean_share * mean_share); var_share = var_share > U(0) ? var_share : U(0); var[row_id] = var_share; } @@ -134,8 +148,9 @@ __global__ void FusedLayernormResidualDropoutBias( U invvar = rsqrt_(var_share + static_cast(epsilon)); // calculate layernorm_dst - CalcLayernormY(scale, layernorm_bias, dst, layernorm_dst, row_id, - col_id, cols, mean_val, invvar); + CalcLayernormY( + scale, layernorm_bias, dst, layernorm_dst, row_id, col_id, cols, mean_val, + invvar); } /** @@ -154,16 +169,17 @@ __global__ void FusedLayernormResidualDropoutBias( * means: [rows]: layernorm means * vars: [rows]: layernorm vars */ -template +template void LaunchLayernormResidualDropoutBias( const uint32_t rows, const uint32_t cols, const int increment, uint64_t seed, const float dropout_prob, const float epsilon, const bool is_upscale_in_train, const bool is_test, const T *src, - const T *residual, const T *bias, const LayerNormParamType *scale, - const LayerNormParamType *layernorm_bias, MaskType *mask_data, T *dst, - T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const platform::CUDADeviceContext &ctx) { - using U = LayerNormParamType; + const T *residual, const T *bias, + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *layernorm_bias, + MaskType *mask_data, T *dst, T *layernorm_dst, LayerNormParamType *mean, + LayerNormParamType *var, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); @@ -175,8 +191,9 @@ void LaunchLayernormResidualDropoutBias( // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( + LayerNormForward< + T, U, kBlockDim, + ScaleBiasWithSameTypeX><<>>( dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon, cols)); default: @@ -184,21 +201,24 @@ void LaunchLayernormResidualDropoutBias( "Product from begin_norm_axis to end must be larger than 1")); break; } + return; } const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); - FusedLayernormResidualDropoutBias<<>>( + FusedLayernormResidualDropoutBias< + T, uint8_t, 1, U, + ScaleBiasWithSameTypeX><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { int blockDim = GetDesiredBlockDim(cols / VecSize); FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize><<>>( + T, uint8_t, VecSize, U, + ScaleBiasWithSameTypeX><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 50e3555b4bcd6..57d3fc94dc88a 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -223,7 +223,7 @@ struct 
TestFusedLayernormResidualDropoutBias { layernorm_bias_ptr = layernorm_bias.data(); } - paddle::operators::LaunchLayernormResidualDropoutBias( + paddle::operators::LaunchLayernormResidualDropoutBias( rows, cols, increment, seed, dropout_prob, epsilon, is_upscale_in_train, is_test, src.data(), residual.data(), bias_ptr, scale_ptr, layernorm_bias_ptr, mask.data(), out.data(), From 7f3b08772273795fb6845d248603addc6adccfe8 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 7 Jan 2022 10:52:30 +0800 Subject: [PATCH 034/151] [new-exec] support pten kernel (#38770) --- .../framework/new_executor/interpretercore.cc | 18 ++++++- .../new_executor/interpretercore_util.cc | 48 +++++++++++++++---- .../new_executor/new_executor_defs.cc | 8 ++++ .../new_executor/new_executor_defs.h | 9 ++++ paddle/fluid/framework/operator.cc | 3 ++ paddle/fluid/framework/operator.h | 22 +++++---- 6 files changed, 89 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7c0bbac61807e..950756c0394a5 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -413,7 +413,23 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { - instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + // fit for pten + if (instr_node.PtenKernel() && instr_node.PtenKernel()->IsValid()) { + VLOG(4) << "Run pten kernel: " << op->Type(); + VLOG(4) << instr_node.InnerRuntimeContext().get() << " " + << &instr_node.DeviceContext(); + op_with_kernel->BuildPtenKernelContext( + *instr_node.InnerRuntimeContext().get(), + const_cast(&instr_node.DeviceContext())); + + (*instr_node.PtenKernel())(instr_node.PtenKernelContext()); + + op_with_kernel->WriteBackToOutputs( + instr_node.InnerRuntimeContext().get()); + instr_node.PtenKernelContext()->ClearData(); + } else { + instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + } } } diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 3817a11b9afe4..41c4faa67fbeb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -19,10 +19,13 @@ #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include "paddle/pten/core/kernel_factory.h" PADDLE_DEFINE_EXPORTED_bool( new_executor_sequential_run, false, "Enable sequential execution for standalone executor, used for debug"); +DECLARE_bool(run_pten_kernel); + namespace paddle { namespace framework { namespace interpreter { @@ -338,6 +341,8 @@ void build_op_func_list(const platform::Place& place, // op is not a operatorwithkernel, so direcly run OperatorBase::Run() deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope); } else { + auto op_with_kernel = + static_cast(op); // construct RuntimeContext and analysis KernelType RuntimeContext runtime_context({}, {}); runtime_context.inputs.swap(ins_map); @@ -350,8 +355,7 @@ void build_op_func_list(const platform::Place& place, // TODO(Aurelius84): In case of control flow ops, they are NOT // inheritted // from OperatorWithKernel. 
- static_cast(op)->InferShape( - &infer_shape_ctx); + op_with_kernel->InferShape(&infer_shape_ctx); } auto kernels_iter = all_op_kernels.find(op->Type()); @@ -367,10 +371,8 @@ void build_op_func_list(const platform::Place& place, platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); Scope scope; - auto expected_kernel_key = - dynamic_cast(op) - ->GetExpectedKernelType( - ExecutionContext(*op, scope, *dev_ctx, runtime_context)); + auto expected_kernel_key = op_with_kernel->GetExpectedKernelType( + ExecutionContext(*op, scope, *dev_ctx, runtime_context)); // change device by the device_guard() apply_device_guard(op, place, &expected_kernel_key); @@ -378,10 +380,16 @@ void build_op_func_list(const platform::Place& place, // step 3. apply data transforms and insert data transfer ops VariableValueMap& ins_map_temp = runtime_context.inputs; + + // NOTE(zhiqiu): op_func_node->operator_base_ maybe changed in + // ApplyDataTransform ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope, &op_func_node, vec_func_list, use_local_scope); + op_with_kernel = static_cast( + op_func_node.operator_base_.get()); + // step 4. Run op kernel - VLOG(3) << op->Type() + VLOG(3) << op_with_kernel->Type() << " : expected_kernel_key : " << expected_kernel_key; if (platform::is_gpu_place(expected_kernel_key.place_)) { @@ -397,7 +405,8 @@ void build_op_func_list(const platform::Place& place, } op_func_node.dev_ctx_ = dev_ctx; - auto exec_ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_context); + auto exec_ctx = + ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); auto kernel_iter = kernels.find(expected_kernel_key); PADDLE_ENFORCE_NE( @@ -406,8 +415,27 @@ void build_op_func_list(const platform::Place& place, "Operator (%s) does not have kernel for %s.", op->Type(), KernelTypeToString(expected_kernel_key))); - op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); - op_func_node.kernel_func_(exec_ctx); + auto run_pten_kernel = false; + + if (FLAGS_run_pten_kernel && + pten::KernelFactory::Instance().HasCompatiblePtenKernel( + op_with_kernel->Type())) { + op_with_kernel->ChoosePtenKernel(exec_ctx); + run_pten_kernel = op_with_kernel->PtenKernel()->IsValid(); + } + + if (run_pten_kernel) { + op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx); + op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); + op_func_node.pt_kernel_context_ = op_with_kernel->PtenKernelContext(); + + (*op_func_node.pt_kernel_)(op_func_node.pt_kernel_context_); + op_with_kernel->WriteBackToOutputs(&runtime_context); + op_func_node.pt_kernel_context_->ClearData(); + } else { + op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); + op_func_node.kernel_func_(exec_ctx); + } // post-process grad_op.outputs if need cast complex grad into real grad. // NOTE(Aurelius84): insert a transfer_dtype_op inplacely to cast it. 
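
Note: both call sites touched above (InterpreterCore::RunInstruction in interpretercore.cc and build_op_func_list in interpretercore_util.cc) now follow the same dispatch recipe built on the OperatorWithKernel helpers this patch makes public. The sketch below restates that recipe in one place for readability; it is illustrative only, the helper names are the ones introduced by this diff, and the surrounding variables (op, exec_ctx, runtime_ctx, dev_ctx, kernel_iter) are assumed to be already set up as in build_op_func_list.

// Sketch only: prefer a compatible pten kernel when the flag is on,
// otherwise fall back to the regular fluid kernel function.
bool run_pten_kernel = false;
if (FLAGS_run_pten_kernel &&
    pten::KernelFactory::Instance().HasCompatiblePtenKernel(op->Type())) {
  op->ChoosePtenKernel(exec_ctx);                  // selects and caches pt_kernel_
  run_pten_kernel = op->PtenKernel()->IsValid();
}
if (run_pten_kernel) {
  op->BuildPtenKernelContext(runtime_ctx, dev_ctx);     // lazily creates pt_kernel_context_
  (*op->PtenKernel())(op->PtenKernelContext());         // launch the pten kernel
  op->WriteBackToOutputs(&runtime_ctx);                 // copy results back into fluid variables
  op->PtenKernelContext()->ClearData();                 // context is reused, so clear it after each run
} else {
  OpKernelComputeFunc(kernel_iter->second)(exec_ctx);   // legacy fluid kernel path
}

The main difference between the two call sites is that RunInstruction reuses the kernel and kernel context cached in the OpFuncNode built here, rather than calling ChoosePtenKernel again on every step.
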
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 73f16fe3e9cc7..4b9404fd178fd 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -673,6 +673,14 @@ OpKernelComputeFunc Instruction::KernelFunc() const { return op_func_node_.kernel_func_; } +pten::Kernel* Instruction::PtenKernel() const { + return op_func_node_.pt_kernel_; +} + +pten::KernelContext* Instruction::PtenKernelContext() const { + return op_func_node_.pt_kernel_context_; +} + OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } OperatorBase* Instruction::OpBase() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index d691a75a6d35b..ca49e7f5670d6 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -295,6 +295,11 @@ struct OpFuncNode { OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned + + // fit for pten kernel + pten::Kernel* pt_kernel_{nullptr}; // not owned + pten::KernelContext* pt_kernel_context_{nullptr}; // not onwed + OpFuncType type_; }; @@ -313,6 +318,10 @@ class Instruction { OpKernelComputeFunc KernelFunc() const; + pten::Kernel* PtenKernel() const; + + pten::KernelContext* PtenKernelContext() const; + OpFuncType KernelType() const; OperatorBase* OpBase() const; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 50e16920a6737..2d2e198ef40ec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1791,6 +1791,9 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( void OperatorWithKernel::BuildPtenKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const { + if (pt_kernel_context_ == nullptr) { + pt_kernel_context_.reset(new pten::KernelContext()); + } // TODO(chenweihang): now only work for very simple case, // many cases need to be deal with later: // 1. 
the input and output are not tensor diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 842ef0457d7bd..59bc4813d985b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -555,6 +555,20 @@ class OperatorWithKernel : public OperatorBase { virtual KernelSignature GetExpectedPtenKernelArgs( const ExecutionContext& ctx) const; + /* member functions for adapting to pten lib */ + void ChoosePtenKernel(const ExecutionContext& ctx) const; + + void BuildPtenKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx) const; + + void WriteBackToOutputs(RuntimeContext* ctx) const; + + pten::Kernel* PtenKernel() const { return pt_kernel_.get(); } + + pten::KernelContext* PtenKernelContext() const { + return pt_kernel_context_.get(); + } + private: void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place, @@ -595,14 +609,6 @@ class OperatorWithKernel : public OperatorBase { Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx, const std::string& name) const; - /* member functions for adapting to pten lib */ - void ChoosePtenKernel(const ExecutionContext& ctx) const; - - void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx) const; - - void WriteBackToOutputs(RuntimeContext* ctx) const; - protected: mutable std::unique_ptr kernel_type_; mutable std::unique_ptr kernel_func_; From 769e5bc49d530bb0d2f919eb23e44cf38c4f8fb5 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Fri, 7 Jan 2022 14:40:51 +0800 Subject: [PATCH 035/151] [fleet_executor] Support multi carriers (#38709) --- .../distributed/fleet_executor/CMakeLists.txt | 3 ++ .../distributed/fleet_executor/carrier.cc | 36 +++++-------------- .../distributed/fleet_executor/carrier.h | 5 --- .../fleet_executor/fleet_executor.cc | 30 ++++++++-------- .../fleet_executor/fleet_executor.h | 3 -- .../fleet_executor/{global_map.h => global.h} | 35 +++++++++++++----- .../interceptor_message_service.cc | 8 ++--- .../distributed/fleet_executor/message_bus.cc | 22 ++++++++++++ .../distributed/fleet_executor/message_bus.h | 1 + .../test/compute_interceptor_run_op_test.cc | 5 ++- .../test/compute_interceptor_test.cc | 5 ++- .../test/interceptor_ping_pong_test.cc | 5 ++- .../interceptor_ping_pong_with_brpc_test.cc | 17 ++++----- .../interceptor_pipeline_long_path_test.cc | 5 ++- .../interceptor_pipeline_short_path_test.cc | 5 ++- 15 files changed, 97 insertions(+), 88 deletions(-) rename paddle/fluid/distributed/fleet_executor/{global_map.h => global.h} (76%) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 95ec6b329964e..e9da55c417e9a 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -19,6 +19,9 @@ cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 45296853adf7b..79be1824b864d 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -71,17 +71,13 @@ Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { - if (interceptor_message.ctrl_message()) { - VLOG(3) << "Receiving control message from rank " - << interceptor_message.src_id() << " to rank " - << interceptor_message.dst_id(); - // for barrier - msg_bus_->IncreaseBarrierCount(); - } else { - int64_t dst_id = interceptor_message.dst_id(); - Interceptor* dst_interceptor = GetInterceptor(dst_id); - dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message); - } + PADDLE_ENFORCE_EQ( + interceptor_message.ctrl_message(), false, + platform::errors::Fatal( + "Control message should be only send inter rank using message bus.")); + int64_t dst_id = interceptor_message.dst_id(); + Interceptor* dst_interceptor = GetInterceptor(dst_id); + dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message); return true; } @@ -106,11 +102,6 @@ void Carrier::WakeUp() { } void Carrier::Start() { - PADDLE_ENFORCE_EQ(msg_bus_->IsInit(), true, - platform::errors::PreconditionNotMet( - "Using message bus since it has not been initialized. " - "Please invoke MessageBus::Init() before using it or " - "neccessary components are not ready.")); PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet( "Using carrier before initialized.")); for (int64_t id : source_interceptor_ids_) { @@ -154,19 +145,10 @@ bool Carrier::Send(const InterceptorMessage& msg) { << " to interceptor " << dst_id << ", which are in the same ranks."; return EnqueueInterceptorMessage(msg); } else { - PADDLE_ENFORCE_NOT_NULL( - msg_bus_.get(), - platform::errors::Unavailable("Message bus is released accidently")); - PADDLE_ENFORCE_EQ( - msg_bus_->IsInit(), true, - platform::errors::PreconditionNotMet( - "Using message bus since it has not been initialized. 
" - "Please invoke MessageBus::Init() before using it or " - "neccessary components are not ready.")); VLOG(3) << "Send a message from interceptor " << src_id << " to interceptor " << dst_id << ", which are in different ranks."; - return msg_bus_->Send(dst_rank, msg); + return GlobalVal::Get()->Send(dst_rank, msg); } } diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index cd70ab46ce58e..75ac07083a796 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -73,10 +73,6 @@ class Carrier final { Interceptor* SetInterceptor(int64_t interceptor_id, std::unique_ptr); - void SetMsgBus(const std::shared_ptr& msg_bus) { - msg_bus_ = msg_bus; - } - void Start(); bool IsInit() const; @@ -107,7 +103,6 @@ class Carrier final { framework::Scope* minibatch_scope_; paddle::platform::Place place_; paddle::platform::DeviceContext* dev_ctx_{nullptr}; - std::shared_ptr msg_bus_; int64_t rank_; std::string carrier_id_; std::unordered_map interceptor_id_to_node_; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index f81cdf200c65d..e22d0945a2398 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -32,6 +32,9 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { bool parse_flag = exe_desc_.ParseFromString(exe_desc_str); PADDLE_ENFORCE(parse_flag, platform::errors::PreconditionNotMet( "Error occurs while parsing string to proto")); + // Message bus will be created and inited only once + GlobalVal::Create(); + InitMessageBus(); } FleetExecutor::~FleetExecutor() { @@ -81,21 +84,16 @@ void FleetExecutor::Init( CopyParameters(i, program_desc); } VLOG(5) << runtime_graph_->DebugString(); - msg_bus_ = std::make_shared(); Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); carrier_ids_.insert(carrier_id); - GlobalVal::Set(carrier_id); - // TODO(liyurui): Maybe message bus should be created only once + // Set current running carrier + GlobalVal::Set(new std::string(carrier_id)); InitCarrier(carrier); - InitMessageBus(); - - // Wait for all message bus connected. - msg_bus_->Barrier(); + GlobalVal::Get()->Barrier(); } void FleetExecutor::InitCarrier(Carrier* carrier) { - carrier->SetMsgBus(msg_bus_); carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), root_scope_, minibatch_scope_, microbatch_scopes_, place_); @@ -131,14 +129,18 @@ void FleetExecutor::InitMessageBus() { VLOG(3) << "The number of ranks are " << (rank_to_addr.size() == 0 ? 
1 : rank_to_addr.size()) << "."; VLOG(5) << ss.str(); - if (!msg_bus_->IsInit()) { - msg_bus_->Init(cur_rank, rank_to_addr, addr); - } + GlobalVal::Get()->Init(cur_rank, rank_to_addr, addr); } void FleetExecutor::Run(const std::string& carrier_id) { - GlobalMap::Get(carrier_id)->Start(); - GlobalVal::Set(carrier_id); + Carrier* carrier = GlobalMap::Get(carrier_id); + // Set current running carrier + if (*GlobalVal::Get() != carrier_id) { + GlobalVal::Set(new std::string(carrier_id)); + // TODO(liyurui): Move barrier to service + GlobalVal::Get()->Barrier(); + } + carrier->Start(); for (auto* micro_scop : microbatch_scopes_) { // By default, we should delete all kid scopes after run executor because // some operators may create local scope when running, such as while_op. diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index 33b7d4a40dc3b..89ab4c62d386f 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -55,9 +55,6 @@ class FleetExecutor final { framework::Scope* minibatch_scope_; platform::Place place_; std::vector microbatch_scopes_; - // The carriers under FleetExecutor will share message bus, - // using shared_ptr to manage lifetime and condition race. - std::shared_ptr msg_bus_; std::unordered_set carrier_ids_; }; diff --git a/paddle/fluid/distributed/fleet_executor/global_map.h b/paddle/fluid/distributed/fleet_executor/global.h similarity index 76% rename from paddle/fluid/distributed/fleet_executor/global_map.h rename to paddle/fluid/distributed/fleet_executor/global.h index 2e2923e447d29..776f314e6afb2 100644 --- a/paddle/fluid/distributed/fleet_executor/global_map.h +++ b/paddle/fluid/distributed/fleet_executor/global.h @@ -14,24 +14,41 @@ #pragma once +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace distributed { -// TODO(liyurui): Change this file to global.h template class GlobalVal final { public: - static T Get() { return *GetPtr(); } - static T Set(T val) { - auto* ptr = GetPtr(); - *ptr = val; - return val; + static T* Get() { + T* ptr = GetPPtr()->get(); + PADDLE_ENFORCE_NOT_NULL( + ptr, platform::errors::NotFound("This value is not global value.")); + return ptr; + } + template + static T* Create(Args&&... 
args) { + auto* ptr = GetPPtr(); + PADDLE_ENFORCE_EQ(ptr->get(), nullptr, + platform::errors::AlreadyExists( + "This value is already a global value.")); + T* item = new T(std::forward(args)...); + ptr->reset(item); + return item; + } + + static T* Set(T* new_item) { + auto* ptr = GetPPtr(); + ptr->reset(new_item); + return ptr->get(); } private: - static T* GetPtr() { - static T value; - return &value; + static std::unique_ptr* GetPPtr() { + static std::unique_ptr ptr; + return &ptr; } }; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc index 52be135f1ce42..ce8a73602d0be 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc @@ -15,8 +15,8 @@ !defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" namespace paddle { namespace distributed { @@ -29,9 +29,7 @@ void InterceptorMessageServiceImpl::InterceptorMessageService( VLOG(3) << "Interceptor Message Service receives a message from interceptor " << request->src_id() << " to interceptor " << request->dst_id() << ", with the message: " << request->message_type(); - const auto& carrier_id = GlobalVal::Get(); - bool flag = GlobalMap::Get(carrier_id) - ->EnqueueInterceptorMessage(*request); + bool flag = GlobalVal::Get()->DispatchMsgToCarrier(*request); response->set_rst(flag); } diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index dd95a90ad1ba4..110c5feafc71a 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -17,6 +17,8 @@ #include #include +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -81,6 +83,10 @@ const std::string& MessageBus::GetAddr(int64_t rank) const { bool MessageBus::Send(int64_t dst_rank, const InterceptorMessage& interceptor_message) { + PADDLE_ENFORCE_EQ( + IsInit(), true, + platform::errors::PreconditionNotMet( + "Using message bus since it has not been initialized.")); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) int retry_time = 0; // message bus will retry sending for 10 times @@ -155,6 +161,22 @@ void MessageBus::Barrier() { } } +bool MessageBus::DispatchMsgToCarrier( + const InterceptorMessage& interceptor_message) { + if (interceptor_message.ctrl_message()) { + VLOG(3) << "Receiving control message from rank " + << interceptor_message.src_id() << " to rank " + << interceptor_message.dst_id(); + // for barrier + IncreaseBarrierCount(); + return true; + } else { + const std::string& carrier_id = *GlobalVal::Get(); + return GlobalMap::Get(carrier_id) + ->EnqueueInterceptorMessage(interceptor_message); + } +} + void MessageBus::ListenPort() { if (addr_ == "") { LOG(INFO) << "No need listen to port since training on single card."; diff --git 
a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index c8685a73900d5..456cd77e2dde8 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -54,6 +54,7 @@ class MessageBus final { void IncreaseBarrierCount(); void Barrier(); + bool DispatchMsgToCarrier(const InterceptorMessage& interceptor_message); private: DISABLE_COPY_AND_ASSIGN(MessageBus); diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index c48fd09623795..07d2a0f6b727a 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -67,9 +67,8 @@ TEST(ComputeInterceptor, Compute) { GlobalMap::Create(carrier_id, carrier_id); carrier->Init(0, {{0, 0}, {1, 0}}); - auto msg_bus = std::make_shared(); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); - carrier->SetMsgBus(msg_bus); // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index f34f862c6285c..954b52693f46c 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -52,9 +52,8 @@ TEST(ComputeInterceptor, Compute) { GlobalMap::Create(carrier_id, carrier_id); carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}}); - auto msg_bus = std::make_shared(); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); - carrier->SetMsgBus(msg_bus); // NOTE: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc index 8289eab167500..19c1d0a0d7a6a 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -64,9 +64,8 @@ TEST(InterceptorTest, PingPong) { Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); carrier->Init(0, {{0, 0}, {1, 0}}); - auto msg_bus = std::make_shared(); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); - carrier->SetMsgBus(msg_bus); Interceptor* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("PingPong", 0, nullptr)); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc index f7adf59a6e819..78cff2606f6b8 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -112,12 +112,10 @@ TEST(InterceptorTest, PingPong) { if (pid == 0) { Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - GlobalVal::Set(carrier_id); - auto msg_bus = std::make_shared(); - carrier->SetMsgBus(msg_bus); - // NOTE: need Init msg_bus after carrier SetMsgBus - carrier->Init(0, interceptor_id_to_rank); + GlobalVal::Set(new std::string(carrier_id)); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, ip0}, {1, ip1}}, ip0); + carrier->Init(0, interceptor_id_to_rank); Interceptor* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("PingPong", 0, nullptr)); msg_bus->Barrier(); @@ -127,11 +125,10 @@ TEST(InterceptorTest, PingPong) { } else { Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - GlobalVal::Set(carrier_id); - auto msg_bus = std::make_shared(); - carrier->SetMsgBus(msg_bus); - carrier->Init(1, interceptor_id_to_rank); + GlobalVal::Set(new std::string(carrier_id)); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(1, {{0, ip0}, {1, ip1}}, ip1); + carrier->Init(1, interceptor_id_to_rank); carrier->SetInterceptor(1, InterceptorFactory::Create("PingPong", 1, nullptr)); msg_bus->Barrier(); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index 2cd0813803f0c..3860e9f4e137e 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -56,9 +56,8 @@ TEST(AmplifierInterceptor, Amplifier) { Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}); - auto msg_bus = std::make_shared(); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - carrier->SetMsgBus(msg_bus); int64_t micro_steps = 3; diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index 66c283b65fb76..b510b68e4e2ed 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/global_map.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -74,9 +74,8 @@ TEST(AmplifierInterceptor, Amplifier) { Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}}); - auto msg_bus = std::make_shared(); + MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, ""}}, ""); - carrier->SetMsgBus(msg_bus); int64_t micro_steps = 6; From c8fbd3cd6f263e0cc8ca04b17db232ab74ca1fce Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 7 Jan 2022 14:41:58 +0800 Subject: [PATCH 036/151] patch_tensor_method_func, test=develop (#38761) --- python/paddle/fluid/framework.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3d8cd1142cf3a..a26e322cbd9b7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -90,6 +90,8 @@ def _test_eager_guard(tracer=None): if not _already_patch_eager_tensor: from .dygraph.varbase_patch_methods import monkey_patch_varbase monkey_patch_varbase() + from .dygraph import monkey_patch_math_varbase + monkey_patch_math_varbase() _already_patch_eager_tensor = True if tracer is None: core._set_eager_tracer(_dygraph_tracer_) From 4a3a2d6b926e2bfb8a040755b8b5af9a0816d293 Mon Sep 17 00:00:00 2001 From: guguguzi <48168743+guguguzi@users.noreply.github.com> Date: Fri, 7 Jan 2022 14:42:19 +0800 Subject: [PATCH 037/151] Add api MultiplicativeDecay (#38250) * delete the modification of dygraph * CI * check CI * modify the retrun value of get_lr --- .../tests/unittests/test_lr_scheduler.py | 11 + python/paddle/optimizer/lr.py | 199 ++++++++++++------ 2 files changed, 144 insertions(+), 66 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 04a0d47e47c86..d62a633c28576 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ 
b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -205,6 +205,13 @@ def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False): return learning_rate * lr_lambda(epoch_num) +def multiplicative_lr(epoch_num, learning_rate, lr_lambda, verbose=False): + latest_lr = learning_rate + for i in range(epoch_num): + latest_lr = latest_lr * lr_lambda(i + 1) + return latest_lr + + def piecewise_lr(epoch_num, boundaries, values, verbose=False): assert len(boundaries) + 1 == len(values) for i in range(len(boundaries)): @@ -519,6 +526,10 @@ def test_scheduler(self): "learning_rate": 0.5, "lr_lambda": lambda x: 0.95**x, "verbose": True + }), (multiplicative_lr, paddle.optimizer.lr.MultiplicativeDecay, { + "learning_rate": 0.5, + "lr_lambda": lambda x: 0.95, + "verbose": True }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, { "learning_rate": 0.5, "T_max": 10, diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index be1786696bd92..d4fafba9229b0 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -17,7 +17,7 @@ import warnings from paddle import Tensor -__all__ = [ #noqa +__all__ = [ # noqa 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', @@ -30,7 +30,8 @@ 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', - 'CosineAnnealingDecay' + 'CosineAnnealingDecay', + 'MultiplicativeDecay' ] @@ -55,9 +56,9 @@ class LRScheduler(object): Examples: Here is an example of a simple ``StepDecay`` implementation. - + .. code-block:: python - + import paddle from paddle.optimizer.lr import LRScheduler @@ -99,7 +100,7 @@ def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): self.step() def __call__(self): - """ + """ Return lastest computed learning rate on current epoch. """ return self.last_lr @@ -107,7 +108,7 @@ def __call__(self): def step(self, epoch=None): """ - ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . The new learning rate will take effect on next ``optimizer.step`` . Args: @@ -191,7 +192,7 @@ def set_state_dict(self, state_dict): def get_lr(self): """ - + For those subclass who overload ``LRScheduler`` (Base Class), User should have a custom implementation of ``get_lr()`` . Otherwise, an ``NotImplementedError`` exception will be thrown. @@ -203,7 +204,7 @@ def get_lr(self): class NoamDecay(LRScheduler): r""" - Applies Noam Decay to the initial learning rate. + Applies Noam Decay to the initial learning rate. The algorithm can be described as following. @@ -211,7 +212,7 @@ class NoamDecay(LRScheduler): new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5}) - Please reference `attention is all you need `_ + Please reference `attention is all you need `_ Args: @@ -312,8 +313,8 @@ class PiecewiseDecay(LRScheduler): learning_rate = 0.1 Args: - boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int. - values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries. + boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int. + values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries. The type of element in the list is python float. 
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -322,7 +323,7 @@ class PiecewiseDecay(LRScheduler): ``PiecewiseDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -388,7 +389,7 @@ class NaturalExpDecay(LRScheduler): r""" Applies natural exponential decay to the initial learning rate. - + The algorithm can be described as following: .. math:: @@ -405,7 +406,7 @@ class NaturalExpDecay(LRScheduler): ``NaturalExpDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -476,7 +477,7 @@ class InverseTimeDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -485,7 +486,7 @@ class InverseTimeDecay(LRScheduler): ``InverseTimeDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -555,7 +556,7 @@ class PolynomialDecay(LRScheduler): .. math:: - decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps}) + decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps}) new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr @@ -563,7 +564,7 @@ class PolynomialDecay(LRScheduler): .. math:: - epoch & = min(epoch, decay\_steps) + epoch & = min(epoch, decay\_steps) new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr @@ -573,7 +574,7 @@ class PolynomialDecay(LRScheduler): decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer. end_lr(float, optional): The minimum final learning rate. Default: 0.0001. power(float, optional): Power of polynomial. Default: 1.0. - cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease + cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -582,7 +583,7 @@ class PolynomialDecay(LRScheduler): ``PolynomialDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -671,21 +672,21 @@ class LinearWarmup(LRScheduler): Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ - + When epoch < warmup_steps, learning rate is updated as: - + .. 
math:: - + lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps} - + where start_lr is the initial learning rate, and end_lr is the final learning rate; - + When epoch >= warmup_steps, learning rate is updated as: - + .. math:: - + lr = learning_rate - + where ``learning_rate`` is float or any subclass of ``LRScheduler`` . Args: @@ -700,7 +701,7 @@ class LinearWarmup(LRScheduler): ``LinearWarmup`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -811,14 +812,14 @@ class ExponentialDecay(LRScheduler): Update learning rate by `gamma` each epoch. The algorithm can be described as following. - + .. math:: new\_learning\_rate = last\_learning\_rate * gamma Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -827,7 +828,7 @@ class ExponentialDecay(LRScheduler): ``ExponentialDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -889,7 +890,7 @@ class MultiStepDecay(LRScheduler): """ Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -906,17 +907,17 @@ class MultiStepDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. milestones (tuple|list): List or tuple of each boundaries. Must be increasing. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``MultiStepDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -999,7 +1000,7 @@ class StepDecay(LRScheduler): """ Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1015,7 +1016,7 @@ class StepDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. step_size (int): the interval to update. It must be a positive integer. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . 
@@ -1025,7 +1026,7 @@ class StepDecay(LRScheduler): Examples: - + .. code-block:: python import paddle @@ -1102,7 +1103,7 @@ class LambdaDecay(LRScheduler): """ Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1118,12 +1119,12 @@ class LambdaDecay(LRScheduler): lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``LambdaDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -1188,37 +1189,37 @@ def get_lr(self): class ReduceOnPlateau(LRScheduler): """ - Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate + Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate by 2 to 10 times once model performance has no longer improvement. - The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` - stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . - (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` + The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` + stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . + (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` number of epochs, the learning rate will be reduced.) In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation. Args: learning_rate (float): The initial learning rate. It is a python float number. - mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the - learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning rate will reduce when ``loss`` stops ascending. Default: ``'min'`` . - factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . + factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . It should be less than 1.0. Default: 0.1. - patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. Default: 10. - threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . 
This make tiny changes of ``loss`` will be ignored. Default: 1e-4. threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` - is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum change of ``loss`` is ``threshold`` . Default: ``'rel'`` . cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. - epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, the update is ignored. Default: 1e-8. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. - + Returns: ``ReduceOnPlateau`` instance to schedule learning rate. @@ -1331,18 +1332,18 @@ def state_keys(self): def step(self, metrics, epoch=None): """ - step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . + step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . The new learning rate will take effect on next epoch. Args: - metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. + metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or 'numpy.ndarray', its shape must be [1]. epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. Returns: None - + Examples: Please refer to the example of current LRScheduler. """ @@ -1354,8 +1355,9 @@ def step(self, metrics, epoch=None): # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ - "should be (1L,), but the current metrics.shape is {}. Maybe that " \ - "you should call paddle.mean to process it first.".format(metrics.shape) + "should be (1L,), but the current metrics.shape is {}. Maybe that " \ + "you should call paddle.mean to process it first.".format( + metrics.shape) elif not isinstance(metrics, (int, float, numpy.float32, numpy.float64)): raise TypeError( @@ -1399,8 +1401,8 @@ def _is_better(self, current, best): class CosineAnnealingDecay(LRScheduler): r""" - Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to - the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in + Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to + the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in SGDR. The algorithm can be described as following. 
@@ -1409,15 +1411,15 @@ class CosineAnnealingDecay(LRScheduler): \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; + & T_{cur} \neq (2k+1)T_{max}; \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), & T_{cur} = (2k+1)T_{max}. - - It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. + + It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. Note that this only implements the cosine annealing part of SGDR, and not the restarts. - + Args: learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number. T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer. @@ -1429,7 +1431,7 @@ class CosineAnnealingDecay(LRScheduler): ``CosineAnnealingDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -1513,3 +1515,68 @@ def get_lr(self): def _get_closed_form_lr(self): return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( math.pi * self.last_epoch / self.T_max)) / 2 + + +class MultiplicativeDecay(LRScheduler): + """ + Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` . + + The algorithm can be described as the code below. + + .. code-block:: text + + learning_rate = 0.5 # init learning_rate + lr_lambda = lambda epoch: 0.95 + + learning_rate = 0.5 # epoch 0, + learning_rate = 0.475 # epoch 1, 0.5*0.95 + learning_rate = 0.45125 # epoch 2, 0.475*0.95 + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the last learning rate by this factor. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``MultiplicativeDecay`` instance to schedule learning rate. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(5): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + + """ + + def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): + if not callable(lr_lambda): + raise TypeError( + "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s." 
+ % type(lr_lambda)) + + self.lr_lambda = lr_lambda + super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + if self.last_epoch > 0: + return self.last_lr * self.lr_lambda(self.last_epoch) + else: + return self.base_lr From f634c0b1978ad853d975347cd9f8220a2949924a Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Fri, 7 Jan 2022 15:17:36 +0800 Subject: [PATCH 038/151] Fix a bug when reduce_num = 1 in Reduce Op (#38771) --- paddle/pten/kernels/gpu/reduce.h | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 0704b76a2f069..5a736ef0e6e72 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -45,8 +45,7 @@ namespace cub = hipcub; #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/cast_kernel.h" -#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/gpu/elementwise.h" // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 @@ -1062,23 +1061,6 @@ static "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } -static void AsyncCopy(const pten::DenseTensor& src, pten::DenseTensor* dst) { - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - const paddle::platform::CUDADeviceContext* dev_ctx; - if (paddle::platform::is_gpu_place(dst->place()) || - paddle::platform::is_npu_place(dst->place())) { - dev_ctx = static_cast( - pool.Get(dst->place())); - - } else { - dev_ctx = static_cast( - pool.Get(src.place())); - } - - pten::Copy(*dev_ctx, src, false, dst); -} - template class ReduceOp, @@ -1111,13 +1093,10 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, auto* dev_ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(x.place())); if (config.reduce_num == 1) { - auto out_dims = y->dims(); - if (x.dtype() == y->dtype()) { - AsyncCopy(x, y); - y->Resize(out_dims); - } else { - pten::CastKernel(*dev_ctx, x, y->dtype(), y); - } + std::vector inputs = {&x}; + std::vector outputs = {y}; + pten::LaunchSameDimsElementwiseCudaKernel( + *dev_ctx, inputs, &outputs, transform); return; } From 0883cf37bef9ca7db37ece787f3ccab3f817a9ae Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 7 Jan 2022 17:38:38 +0800 Subject: [PATCH 039/151] [newExe]Fix blocking in lr_sheduler from Executor (#38786) --- python/paddle/fluid/executor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8702c800498ef..7f282b8cea07a 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1416,7 +1416,10 @@ def _can_use_interpreter_core(program, place): [lr_value]).astype(convert_dtype(lr_var.dtype)) tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) - tensor.set(data, self.place) + # NOTE(dev): `set` always call TensorCopySync that is a + # blocking behavior. So we use `_copy_from` to replace it. 
+ cpu_tensor = _as_lodtensor(data, core.CPUPlace()) + tensor._copy_from(cpu_tensor, self.place) return new_exe.run(list(feed.keys()), fetch_list, return_numpy) From fb3313e99c957101e30d913b60a95f02904ecf2d Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 7 Jan 2022 19:04:29 +0800 Subject: [PATCH 040/151] Add multi tensor for adam (#38010) * add multi tensor for adam * add merged_adam op * refine code * refine adam compute logic --- paddle/fluid/operators/optimizers/adam_op.cu | 28 ++- .../operators/optimizers/merged_adam_op.cc | 138 +++++++++++++ .../operators/optimizers/merged_adam_op.cu | 191 ++++++++++++++++++ .../operators/optimizers/merged_adam_op.h | 104 ++++++++++ paddle/fluid/pybind/op_function_generator.h | 9 + .../fluid/tests/unittests/test_adam_op.py | 181 +++++++++++++++++ .../tests/unittests/test_merged_adam_op.py | 157 ++++++++++++++ python/paddle/optimizer/adam.py | 171 ++++++++++++++++ python/paddle/optimizer/optimizer.py | 8 +- 9 files changed, 968 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/merged_adam_op.cc create mode 100644 paddle/fluid/operators/optimizers/merged_adam_op.cu create mode 100644 paddle/fluid/operators/optimizers/merged_adam_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_merged_adam_op.py diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 57231e1135a6a..3b9cf159f1b6b 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -29,20 +29,18 @@ __global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, MT beta1_pow = beta1_pow_; MT beta2_pow = beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { MT p = master_param ? 
master_param[id] : static_cast(param[id]); MT g = static_cast(grad[id]); - MT mom1 = moment1[id]; - MT mom2 = moment2[id]; + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -65,9 +63,6 @@ __global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, MT beta1_pow = *beta1_pow_; MT beta2_pow = *beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { @@ -77,8 +72,9 @@ __global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, MT mom2 = static_cast(moment2[id]); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -105,8 +101,6 @@ __global__ void SparseAdamCUDAKernelREG( int64_t row_numel, int64_t row_count, bool lazy_mode, int ndim) { int id = blockIdx.x * blockDim.x + threadIdx.x; MT lr = *lr_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); for (; id < ndim; id += blockDim.x * gridDim.x) { auto row_idx = @@ -122,8 +116,10 @@ __global__ void SparseAdamCUDAKernelREG( : static_cast(0); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); // Write back to global memory mom1_out_[id] = mom1; diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc new file mode 100644 index 0000000000000..11c047305c44a --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/optimizers/merged_adam_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class MergedAdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto param_dtype = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(param_dtype, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +class MergedAdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", "(Tensor, default Tensor) Input parameter") + .AsDuplicable(); + AddInput("Grad", "(Tensor, default Tensor) Input gradient") + .AsDuplicable(); + AddInput("LearningRate", "(Tensor, default Tensor) Learning rate") + .AsDuplicable(); + AddInput("Moment1", "(Tensor, default Tensor) Input first moment") + .AsDuplicable(); + AddInput("Moment2", "(Tensor, default Tensor) Input second moment") + .AsDuplicable(); + AddInput("Beta1Pow", + "(Tensor, default Tensor) Input beta1 power accumulator") + .AsDuplicable(); + AddInput("Beta2Pow", + "(Tensor, default Tensor) Input beta2 power accumulator") + .AsDuplicable(); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDispensable() + .AsDuplicable(); + + AddOutput("ParamOut", "(Tensor) Output parameter").AsDuplicable(); + AddOutput("Moment1Out", "(Tensor) Output first moment").AsDuplicable(); + AddOutput("Moment2Out", "(Tensor) Output second moment").AsDuplicable(); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator") + .AsDuplicable(); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator") + .AsDuplicable(); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable() + .AsDuplicable(); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); + + AddComment(R"DOC( +Adam Optimizer. +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. 
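For reference, the per-element update performed by AdamKernelREG/AdamKernelMEM in this patch can be sketched in NumPy (illustrative only, not part of the patch); the refactored form, which divides sqrt(moment2) by sqrt(1 - beta2_pow) before adding epsilon, is algebraically the same as the previous formulation that folded the bias correction into the learning rate:

import numpy as np

def adam_step(p, g, mom1, mom2, lr, beta1, beta2, eps, beta1_pow, beta2_pow):
    # NumPy sketch of the update in AdamKernelREG / AdamKernelMEM (illustrative).
    mom1 = beta1 * mom1 + (1.0 - beta1) * g
    mom2 = beta2 * mom2 + (1.0 - beta2) * g * g
    denom = np.sqrt(mom2) / np.sqrt(1.0 - beta2_pow) + eps
    p = p - (lr / (1.0 - beta1_pow)) * (mom1 / denom)
    # Equivalent to the earlier formulation:
    #   lr_t = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
    #   p   -= lr_t * mom1 / (sqrt(mom2) + eps * sqrt(1 - beta2_pow))
    return p, mom1, mom2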
+Adam updates: +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(merged_adam, ops::MergedAdamOp, + ops::MergedAdamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(merged_adamw, ops::MergedAdamOp, + ops::MergedAdamOpMaker); + +REGISTER_OP_CPU_KERNEL( + merged_adam, + ops::MergedAdamOpKernel, + ops::MergedAdamOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cu b/paddle/fluid/operators/optimizers/merged_adam_op.cu new file mode 100644 index 0000000000000..2523fb9e5c680 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cu @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/merged_adam_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, + MT beta2_pow_, const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, + const MT* beta1_pow_, const MT* beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void UpdateBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +class MergedAdamOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using MPDType = typename details::MPTypeTrait::Type; + + auto param = ctx.MultiInput("Param"); + auto grad = ctx.MultiInput("Grad"); + auto lr = ctx.MultiInput("LearningRate"); + auto mom1 = ctx.MultiInput("Moment1"); + auto mom2 = ctx.MultiInput("Moment2"); + auto beta1_pow = ctx.MultiInput("Beta1Pow"); + auto beta2_pow = ctx.MultiInput("Beta2Pow"); + + auto param_out = ctx.MultiOutput("ParamOut"); + auto mom1_out = ctx.MultiOutput("Moment1Out"); + auto mom2_out = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); + + MPDType beta1 = static_cast(ctx.Attr("beta1")); + MPDType beta2 = static_cast(ctx.Attr("beta2")); + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + const bool multi_precision = ctx.Attr("multi_precision"); + auto master_param = ctx.MultiInput("MasterParam"); + auto master_param_out = + ctx.MultiOutput("MasterParamOut"); + + auto& dev_ctx = ctx.template device_context(); + + size_t param_num = param.size(); + for (size_t idx = 0; idx < param_num; idx++) { + const MPDType* master_in_data = + multi_precision ? master_param[idx]->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? 
master_param_out[idx]->mutable_data(ctx.GetPlace()) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param[idx]->numel() + threads - 1) / threads; + + if (beta1_pow[idx]->place() == platform::CPUPlace() && + beta2_pow[idx]->place() == platform::CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1, beta2, epsilon, *beta1_pow[idx]->data(), + *beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), + mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), + lr[idx]->data(), grad[idx]->data(), + param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param[idx]->numel()); + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow[idx]->data()[0]; + beta2_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow[idx]->data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1, beta2, epsilon, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), + mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), + lr[idx]->data(), grad[idx]->data(), + param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param[idx]->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + beta1_pow_out[idx]->mutable_data(ctx.GetPlace()), + beta2_pow_out[idx]->mutable_data(ctx.GetPlace())); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(merged_adam, ops::MergedAdamOpCUDAKernel, + ops::MergedAdamOpCUDAKernel, + ops::MergedAdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h new file mode 100644 index 0000000000000..c9417158fe772 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +template +class MergedAdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param = ctx.MultiInput("Param"); + size_t n = param.size(); + auto grad = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ(n, grad.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to " + "Input(Param), but got the size of Input(Grad) " + "is %d, the size of Input(Param) is %d.", + grad.size(), n)); + auto lr = ctx.MultiInput("LearningRate"); + PADDLE_ENFORCE_EQ( + n, lr.size(), + platform::errors::InvalidArgument( + "The size of Input(LearningRate) must be equal to " + "Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lr.size(), n)); + auto mom1 = ctx.MultiInput("Moment1"); + PADDLE_ENFORCE_EQ(n, mom1.size(), + platform::errors::InvalidArgument( + "The size of Input(Moment1) must be equal to " + "Input(Param), but got the size of Input(Moment1) " + "is %d, the size of Input(Param) is %d.", + mom1.size(), n)); + auto mom2 = ctx.MultiInput("Moment2"); + PADDLE_ENFORCE_EQ(n, mom2.size(), + platform::errors::InvalidArgument( + "The size of Input(Moment2) must be equal to " + "Input(Param), but got the size of Input(Moment2) " + "is %d, the size of Input(Param) is %d.", + mom2.size(), n)); + auto beta1_pow = ctx.MultiInput("Beta1Pow"); + PADDLE_ENFORCE_EQ(n, beta1_pow.size(), + platform::errors::InvalidArgument( + "The size of Input(Beta1Pow) must be equal to " + "Input(Param), but got the size of Input(Beta1Pow) " + "is %d, the size of Input(Param) is %d.", + beta1_pow.size(), n)); + auto beta2_pow = ctx.MultiInput("Beta2Pow"); + PADDLE_ENFORCE_EQ(n, beta2_pow.size(), + platform::errors::InvalidArgument( + "The size of Input(Beta2Pow) must be equal to " + "Input(Param), but got the size of Input(Beta2Pow) " + "is %d, the size of Input(Param) is %d.", + beta2_pow.size(), n)); + + auto param_out = ctx.MultiOutput("ParamOut"); + auto mom1_out = ctx.MultiOutput("Moment1Out"); + auto mom2_out = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + size_t param_num = param.size(); + for (size_t idx = 0; idx < param_num; idx++) { + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), lr[idx]->data(), + grad[idx]->data(), param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace())); + functor(param[idx]->numel()); + if (!use_global_beta_pow) { + beta1_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow[idx]->data()[0]; + beta2_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow[idx]->data()[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 
e14b836bf0830..f83997843f433 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -71,6 +71,9 @@ std::map> op_ins_map = { {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, + {"merged_adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, {"adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, @@ -123,6 +126,9 @@ std::map> op_outs_map = { {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"merged_adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"adamw", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, @@ -148,6 +154,9 @@ std::map> op_passing_outs_map = { {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"merged_adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"adamw", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 70109164960a3..a06f0d390e517 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -1011,5 +1011,186 @@ def test_adam_op(self): adam.clear_gradients() +class TestMultiTensorAdam(unittest.TestCase): + def _adam_optimize_dygraph(self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + weight_attr = paddle.ParamAttr( + learning_rate=0.5, + regularizer=paddle.regularizer.L2Decay(1.0), + trainable=True) + if use_param_attr: + model = paddle.nn.Linear(5, 5, weight_attr) + else: + model = paddle.nn.Linear(5, 5) + + if not use_param_group: + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp) + else: + optimizer = paddle.optimizer.Adam( + parameters=[{ + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp) + + for idx in range(2): + if place == 'gpu' and use_amp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def _adam_optimize_static(self, + place, + use_amp=False, + use_multi_tensor=False): + paddle.enable_static() + paddle.seed(10) + np.random.seed(10) + if place == 'cpu': + use_amp = False + exe = paddle.static.Executor(place=place) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.Adam( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor) + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, 
+ init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16') + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32') + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.fluid.layers.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + if use_amp: + optimizer.amp_init(place=place, scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + loss_data, = exe.run(train_program, + feed={"X": x}, + fetch_list=[loss.name]) + out.append(loss_data) + return out + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def _check_with_place_amp(self, place, use_amp): + # test dygraph mode + output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph( + place=place, use_amp=use_amp, use_multi_tensor=True) + output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph( + place=place, use_amp=use_amp, use_multi_tensor=False) + self.assertEqual( + np.allclose( + output_dygraph1, output_dygraph2, rtol=1e-05), True) + for idx in range(len(params_dygraph1)): + self.assertEqual( + np.allclose( + params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05), + True) + # test static mode + output_static1 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True) + output_static2 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False) + for idx in range(len(output_static1)): + self.assertEqual( + np.allclose( + output_static1[idx], output_static2[idx], rtol=1e-05), + True) + + def _check_with_param_arrt(self, place, use_amp): + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=True) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=False) + + self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True) + for idx in range(len(params1)): + self.assertEqual( + np.allclose( + params1[idx], params2[idx], rtol=1e-05), True) + + def _check_with_param_group(self, place, use_amp): + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=True) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=False) + + self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True) + for idx in range(len(params1)): + self.assertEqual( + np.allclose( + params1[idx], params2[idx], rtol=1e-05), True) + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._check_with_place_amp(place, use_amp) + self._check_with_param_arrt(place, use_amp) + self._check_with_param_group(place, use_amp) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py new file mode 100644 index 0000000000000..f515a9f95b109 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +from paddle import _C_ops + + +def run_adam_op(params, + grads, + lrs, + moment1s, + moment2s, + beta1_pows, + beta2_pows, + master_params, + epsilon, + beta1, + beta2, + place, + multi_precision=False, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(lrs) + assert len(params) == len(moment1s) + assert len(params) == len(moment2s) + assert len(params) == len(beta1_pows) + assert len(params) == len(beta1_pows) + assert len(params) == len(master_params) + paddle.disable_static() + paddle.set_device(place) + + param_vars = [paddle.fluid.dygraph.to_variable(p) for p in params] + grad_vars = [paddle.fluid.dygraph.to_variable(g) for g in grads] + lr_vars = [paddle.fluid.dygraph.to_variable(l) for l in lrs] + moment1_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment1s] + moment2_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment2s] + beta1_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta1_pows] + beta2_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta2_pows] + master_param_vars = [ + paddle.fluid.dygraph.to_variable(m_p) for m_p in master_params + ] + + if not use_merged: + for i in range(len(param_vars)): + _, _, _, _, _, _ = _C_ops.adam( + param_vars[i], grad_vars[i], lr_vars[i], moment1_vars[i], + moment2_vars[i], beta1_pow_vars[i], beta2_pow_vars[i], + master_param_vars[i], param_vars[i], moment1_vars[i], + moment2_vars[i], beta1_pow_vars[i], beta2_pow_vars[i], + master_param_vars[i], 'epsilon', epsilon, 'beta1', beta1, + 'beta2', beta2, 'multi_precision', multi_precision) + else: + _, _, _, _, _, _ = _C_ops.merged_adam( + param_vars, grad_vars, lr_vars, moment1_vars, moment2_vars, + beta1_pow_vars, beta2_pow_vars, master_param_vars, param_vars, + moment1_vars, moment2_vars, beta1_pow_vars, beta2_pow_vars, + master_param_vars, 'epsilon', epsilon, 'beta1', beta1, 'beta2', + beta2, 'multi_precision', multi_precision) + + outputs = { + 'ParamOut': param_vars, + 'Moment1Out': moment1_vars, + 'Moment2Out': moment2_vars, + 'Beta1PowOut': beta1_pow_vars, + 'Beta2PowOut': beta2_pow_vars, + 'MasterParamOut': master_param_vars + } + + return outputs + + +class TestMergedAdam(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float16 if multi_precision and place == 'gpu' else np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + lrs = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + moment1s = self.gen_rand_data(shapes, mp_dtype) + moment2s = self.gen_rand_data(shapes, mp_dtype) + beta1_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + beta2_pows = 
self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + master_params = [p.astype(mp_dtype) for p in params] + return params, grads, lrs, moment1s, moment2s, beta1_pows, beta2_pows, master_params + + def check_with_place(self, place, multi_precision): + params, grads, lrs, moment1s, moment2s, beta1_pows, beta2_pows, master_params = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + return run_adam_op( + params=params, + grads=grads, + lrs=lrs, + moment1s=moment1s, + moment2s=moment2s, + beta1_pows=beta1_pows, + beta2_pows=beta2_pows, + master_params=master_params, + epsilon=0.9, + beta1=0.9, + beta2=0.99, + place=place, + multi_precision=multi_precision, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + + for key in outs1.keys(): + value1 = outs1[key] + value2 = outs2[key] + for i in range(len(value1)): + if place == 'gpu': + self.assertTrue(np.array_equal(value1[i], value2[i])) + else: + self.assertTrue( + np.allclose( + value1[i], value2[i], atol=1e-7)) + + def get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for multi_precision in [False, True]: + for place in self.get_places(): + self.check_with_place(place, multi_precision) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index cc28eead522d4..8134c9f71b669 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -92,6 +92,7 @@ class Adam(Optimizer): different semantics with the original Adam algorithm and may lead to different result. The default value is False. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. + use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. @@ -172,6 +173,7 @@ def __init__(self, grad_clip=None, lazy_mode=False, multi_precision=False, + use_multi_tensor=False, name=None): assert learning_rate is not None assert beta1 is not None @@ -209,6 +211,24 @@ def __init__(self, 'lazy_mode': lazy_mode, } + self._use_multi_tensor = use_multi_tensor + if self._use_multi_tensor: + self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._beta1_pow_acc_dict = { + 'FP32_LODTensor': [], + 'FP16_LODTensor': [] + } + self._beta2_pow_acc_dict = { + 'FP32_LODTensor': [], + 'FP16_LODTensor': [] + } + self._master_weight_dict = { + 'FP32_LODTensor': None, + 'FP16_LODTensor': [] + } + def _create_master_weight(self, param): if param.name in self._master_weights: var = self._master_weights[param.name] @@ -436,6 +456,157 @@ def step(self): self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) + def _multi_tensor_init(self, target_block, parameters): + """ + All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). + This function will be overridden in the corresponding optimizer file. 
+ Args: + target_block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + """ + self._create_accumulators(target_block, parameters) + for param in parameters: + moment1 = self._get_accumulator(self._moment1_acc_str, param) + moment2 = self._get_accumulator(self._moment2_acc_str, param) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + + if param.dtype == paddle.float32: + self._param_dict['FP32_LODTensor'].append(param) + self._moment1_dict['FP32_LODTensor'].append(moment1) + self._moment2_dict['FP32_LODTensor'].append(moment2) + self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc) + elif param.dtype == paddle.float16: + self._param_dict['FP16_LODTensor'].append(param) + self._moment1_dict['FP16_LODTensor'].append(moment1) + self._moment2_dict['FP16_LODTensor'].append(moment2) + self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc) + if self._multi_precision: + self._master_weight_dict['FP16_LODTensor'].append( + self._master_weights[param.name]) + else: + self._master_weight_dict['FP16_LODTensor'] = None + else: + raise ValueError( + "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." + ) + + def _append_optimize_multi_tensor_op(self, target_block, + parameters_and_grads): + """ + For Multi Tensor, append optimize merged_operator to block. + """ + assert isinstance(target_block, framework.Block) + + grad_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + lr_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + + if isinstance(parameters_and_grads, list): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + if param_and_grad[0].stop_gradient is False: + if param_and_grad[ + 0].dtype == paddle.float32 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP32_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP32_LODTensor'].append(lr) + elif param_and_grad[ + 0].dtype == paddle.float16 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP16_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP16_LODTensor'].append(lr) + else: + for param_and_grad in parameters_and_grads['params']: + if param_and_grad[1] is None: + continue + if param_and_grad[0].stop_gradient is False: + param_grad_dict = dict() + param_grad_dict['params'] = param_and_grad + param_grad_dict.update({ + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + }) + param_and_grad = self._update_param_group(param_grad_dict) + if param_and_grad[ + 0].dtype == paddle.float32 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP32_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP32_LODTensor'].append(lr) + elif param_and_grad[ + 0].dtype == paddle.float16 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP16_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP16_LODTensor'].append(lr) + + multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] + for key in multi_tensor_list: + if len(self._param_dict[key]) > 0: + if key == 'FP32_LODTensor': + 
self._multi_precision = False + + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + + if framework.in_dygraph_mode(): + _, _, _, _, _, _ = _C_ops.merged_adam( + self._param_dict[key], grad_dict[key], lr_dict[key], + self._moment1_dict[key], self._moment2_dict[key], + self._beta1_pow_acc_dict[key], + self._beta2_pow_acc_dict[key], + self._master_weight_dict[key], self._param_dict[key], + self._moment1_dict[key], self._moment2_dict[key], + self._beta1_pow_acc_dict[key], + self._beta2_pow_acc_dict[key], + self._master_weight_dict[key], 'epsilon', self._epsilon, + 'beta1', _beta1, 'beta2', _beta2, 'multi_precision', + self._multi_precision) + else: + inputs = { + "Param": self._param_dict[key], + "Grad": grad_dict[key], + "LearningRate": lr_dict[key], + "Moment1": self._moment1_dict[key], + "Moment2": self._moment2_dict[key], + "Beta1Pow": self._beta1_pow_acc_dict[key], + "Beta2Pow": self._beta2_pow_acc_dict[key] + } + outputs = { + "ParamOut": self._param_dict[key], + "Moment1Out": self._moment1_dict[key], + "Moment2Out": self._moment2_dict[key], + "Beta1PowOut": self._beta1_pow_acc_dict[key], + "Beta2PowOut": self._beta2_pow_acc_dict[key] + } + attrs = { + "epsilon": self._epsilon, + "beta1": _beta1, + "beta2": _beta2 + } + if self._multi_precision: + inputs["MasterParam"] = self._master_weight_dict[key] + outputs["MasterParamOut"] = self._master_weight_dict[ + key] + attrs["multi_precision"] = self._multi_precision + target_block.append_op( + type="merged_adam", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + return None + def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) self._beta2 = parameters.get('beta2', self._default_dict['beta2']) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index a711d98df6fa1..3fc70449d15c9 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -218,7 +218,7 @@ def __init__(self, self._param_groups = self._parameter_list # NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode. - # Optimizer support list: [ paddle.optimizer.Momentum ]. + # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam]. 
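The NOTE above summarizes the multi-tensor strategy; a minimal dygraph usage sketch, mirroring the TestMultiTensorAdam case added in this patch (the model and shapes are placeholders), looks like this:

import paddle

paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')
model = paddle.nn.Linear(5, 5)
# use_multi_tensor is the new flag from this patch; in dygraph it routes the
# update through one merged_adam op per dtype group instead of one adam op
# per parameter.
opt = paddle.optimizer.Adam(parameters=model.parameters(), use_multi_tensor=True)

loss = paddle.mean(model(paddle.randn((5, 5))))
loss.backward()
opt.step()
opt.clear_grad()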
self._use_multi_tensor = None self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} @@ -684,8 +684,10 @@ def _create_optimization_pass(self, parameters_and_grads): self._create_global_learning_rate() - # NOTE: Multi Tensor support [ Momentum ] for dygraph mode - if self._use_multi_tensor and self.__class__.__name__ in ['Momentum']: + # NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode + if self._use_multi_tensor and self.__class__.__name__ in [ + 'Momentum', 'Adam' + ]: if len(self._param_dict['FP32_LODTensor']) == 0 and len( self._param_dict['FP16_LODTensor']) == 0: if isinstance(parameters_and_grads, list): From 8c92337c96bc61fc86137f153416b91705ff80e0 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Fri, 7 Jan 2022 19:20:35 +0800 Subject: [PATCH 041/151] modify mish op and add mish api (#38734) * add mish operator and api * remove redundant code and modify grad_atol of mish unittest * modify mish code to be consistent with other activation implementation --- paddle/fluid/operators/activation_op.cc | 37 ++++ paddle/fluid/operators/activation_op.cu | 50 +++++ paddle/fluid/operators/activation_op.h | 41 ++++ paddle/fluid/operators/mish_op.cc | 121 ------------ paddle/fluid/operators/mish_op.cu | 177 ------------------ paddle/fluid/operators/mish_op.h | 137 -------------- python/paddle/fluid/layers/nn.py | 5 +- .../tests/unittests/test_activation_op.py | 81 ++++++++ .../fluid/tests/unittests/test_mish_op.py | 102 ---------- python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/activation.py | 41 ++++ python/paddle/nn/layer/activation.py | 46 +++++ 13 files changed, 304 insertions(+), 538 deletions(-) delete mode 100644 paddle/fluid/operators/mish_op.cc delete mode 100644 paddle/fluid/operators/mish_op.cu delete mode 100644 paddle/fluid/operators/mish_op.h delete mode 100644 python/paddle/fluid/tests/unittests/test_mish_op.py diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1d0dda7cd6626..c5ca1fd0e8cab 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -806,6 +806,36 @@ Swish Activation Operator. } }; +class MishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Mish operator"); + AddOutput("Out", "Output of Mish operator"); + AddAttr( + "threshold", + "Constant threshold of softplus in Mish operator. Approximate value " + "of softplus will be used if absolute value of input is greater than " + ":attr:`threshold`") + .SetDefault(20.f); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); + AddComment(R"DOC( +Mish Activation Operator. + +.. 
math:: + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + out = x * \tanh(softplus(x)) + +)DOC"); + } +}; + class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -1901,4 +1931,11 @@ REGISTER_OP_VERSION(softplus) .NewAttr("threshold", "The threshold value of the new formula", 20.0f)); +REGISTER_OP_VERSION(mish) + .AddCheckpoint( + R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", + false)); + /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 4818043b93be2..342ed3a6b19e2 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1145,6 +1145,55 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CudaMishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // mish(x) = x * tanh(softplus(x)) + // softplus(x) = x, if x > threshold + // = ln(1 + exp(x)), otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); + return static_cast(x * tanh(sp)); + } +}; + +template +struct CudaMishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) + // sp = softplus(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); + MPType gsp = + (x > static_cast(threshold)) ? 
one : one / (one + exp(-x)); + MPType tsp = tanh(sp); + return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct CudaThresholdedReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -1808,6 +1857,7 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 9ba49e598ed5c..6e32860d69c62 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1412,6 +1412,46 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// mish(x) = x * tanh(softplus(x)) +// softplus(x) = x, if x > threshold +// = ln(1 + exp(x)), otherwise +template +struct MishFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + out.device(d) = x * sp.tanh(); + } +}; + +// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) +// sp = softplus(x) +template +struct MishGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + auto gsp = static_cast(1) - (-sp).exp(); + auto tsp = sp.tanh(); + dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // softsign(x) = x / (1 + |x|) template struct SoftsignFunctor : public BaseActivationFunctor { @@ -2841,4 +2881,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ ThresholdedReluGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/mish_op.cc b/paddle/fluid/operators/mish_op.cc deleted file mode 100644 index ea754b5b1e941..0000000000000 --- a/paddle/fluid/operators/mish_op.cc +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/mish_op.h" -#include -#include - -namespace paddle { -namespace operators { - -class MishOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mish"); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); - } -}; - -class MishOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of Mish operator"); - AddOutput("Out", "Output of Mish operator"); - AddAttr( - "threshold", - "Constant threshold of softplus in Mish operator. Approximate value " - "of softplus will be used if absolute value of input is greater than " - ":attr:`threshold`") - .SetDefault(20.f); - AddComment(R"DOC( -Mish Activation Operator. - -.. math:: - softplus = \begin{cases} - x, \text{if } x > \text{threshold} \\ - e^{x}, \text{if } x < -\text{threshold} \\ - \ln(1 + e^{x}), \text{otherwise} - \end{cases} - - out = x * \tanh(softplus) - -)DOC"); - } -}; - -// The operator to calculate gradients of a prelu operator. -class MishGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - "Out@GRAD", "mish"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); - } -}; - -template -class MishGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("mish_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(mish, ops::MishOp, ops::MishOpMaker, - ops::MishGradOpMaker, - ops::MishGradOpMaker); -REGISTER_OPERATOR(mish_grad, ops::MishGradOp); -REGISTER_OP_CPU_KERNEL( - mish, ops::MishFP32CPUKernel, - ops::MishCPUKernel); -REGISTER_OP_CPU_KERNEL( - mish_grad, ops::MishGradFP32CPUKernel, - ops::MishGradCPUKernel); diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu deleted file mode 100644 index 4ca07b650c80a..0000000000000 --- a/paddle/fluid/operators/mish_op.cu +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mish_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeMishFw(const T* in, T* out, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - T x = in[tid]; - T sp = CalcSoftplus(x, threshold); - out[tid] = x * tanh(sp); - } -} - -// expf instead of exp should be used for float type, complement -// and register float kernel separatelly -__global__ void KeMishFwFP32(const float* in, float* out, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - float x = in[tid]; - float sp = CalcSoftplusFP32(x, threshold); - out[tid] = x * tanhf(sp); - } -} - -template -__global__ void KeMishBw(const T* in, const T* dout, T* din, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - T x = in[tid]; - T sp = CalcSoftplus(x, threshold); - T tsp = tanh(sp); - T grad_sp = -expm1(-sp); - T grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - din[tid] = dout[tid] * (x * grad_tsp + tsp); - } -} - -__global__ void KeMishBwFP32(const float* in, const float* dout, float* din, - const int numel, const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - float x = in[tid]; - float sp = CalcSoftplusFP32(x, threshold); - float tsp = tanhf(sp); - float grad_sp = -expm1f(-sp); - float grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - din[tid] = dout[tid] * (x * grad_tsp + tsp); - } -} - -template -class MishCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishFw<<>>(x_data, out_data, numel, - threshold); - } -}; - -template -class MishFP32CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - float* out_data = out->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishFwFP32<<>>(x_data, out_data, - numel, threshold); - } -}; - -template -class 
MishGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - const T* dout_data = dout->data(); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishBw<<>>( - x_data, dout_data, dx_data, numel, threshold); - } -}; - -template -class MishGradFP32CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - const float* dout_data = dout->data(); - float* dx_data = dx->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishBwFP32<<>>( - x_data, dout_data, dx_data, numel, threshold); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mish, ops::MishFP32CUDAKernel, - ops::MishCUDAKernel) -REGISTER_OP_CUDA_KERNEL( - mish_grad, ops::MishGradFP32CUDAKernel, - ops::MishGradCUDAKernel) diff --git a/paddle/fluid/operators/mish_op.h b/paddle/fluid/operators/mish_op.h deleted file mode 100644 index 86ccb57d929e5..0000000000000 --- a/paddle/fluid/operators/mish_op.h +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE static T CalcSoftplus(T x, float threshold) { - if (threshold > 0 && x > threshold) { - return x; - } else if (threshold > 0 && x < -threshold) { - return exp(x); - } else { - return log1p(exp(x)); - } -} - -// expf instead of exp should be used for float type, complement -// and register float kernel separatelly -HOSTDEVICE static float CalcSoftplusFP32(float x, float threshold) { - if (threshold > 0 && x > threshold) { - return x; - } else if (threshold > 0 && x < -threshold) { - return expf(x); - } else { - return log1pf(expf(x)); - } -} - -template -class MishCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - T x_d = x_data[i]; - T sp = CalcSoftplus(x_d, threshold); - out_data[i] = x_d * std::tanh(sp); - } - } -}; - -template -class MishFP32CPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - float* out_data = out->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - float x_d = x_data[i]; - float sp = CalcSoftplusFP32(x_d, threshold); - out_data[i] = x_d * std::tanh(sp); - } - } -}; - -template -class MishGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - const T* dout_data = dout->data(); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - T x_d = x_data[i]; - T sp = CalcSoftplus(x_d, threshold); - T tsp = std::tanh(sp); - T grad_sp = -std::expm1(-sp); - T grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp); - } - } -}; - -template -class MishGradFP32CPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - const float* dout_data = dout->data(); - float* dx_data = dx->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - float x_d = x_data[i]; - float sp = CalcSoftplusFP32(x_d, threshold); - float tsp = std::tanh(sp); - float grad_sp = -std::expm1f(-sp); - float grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 260d876177c37..1c357c6fa74d5 100755 --- a/python/paddle/fluid/layers/nn.py +++ 
b/python/paddle/fluid/layers/nn.py @@ -15191,6 +15191,9 @@ def mish(x, threshold=20, name=None): out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) print(out) # [[0.66666667, 1.66666667, 3., 4.]] """ + if in_dygraph_mode(): + return _C_ops.mish(x, 'threshold', threshold) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') check_type(threshold, 'threshold', (float, int), 'mish') assert threshold > 0, "threshold of mish should be greater than 0, " \ @@ -15202,7 +15205,7 @@ def mish(x, threshold=20, name=None): type='mish', inputs={'X': x}, outputs={'Out': out}, - attrs={'threshold': threshold or -1}) + attrs={'threshold': threshold}) return out diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index c1d7802633eca..d3d8fdd703148 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2837,6 +2837,86 @@ def test_errors(self): F.swish(x_fp16) +def ref_mish(x, threshold=20.): + softplus = np.select([x <= threshold, x > threshold], + [np.log(1 + np.exp(x)), x]) + return x * np.tanh(softplus) + + +class TestMish(TestActivation): + def setUp(self): + self.op_type = "mish" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_mish(x) + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + +class TestMishAPI(unittest.TestCase): + # test paddle.nn.Mish, paddle.nn.functional.mish + def setUp(self): + np.random.seed(1024) + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.mish(x) + mish = paddle.nn.Mish() + out2 = mish(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_mish(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.mish(x) + mish = paddle.nn.Mish() + out2 = mish(x) + out_ref = ref_mish(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.mish(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_mish(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.mish, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.mish, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') + F.mish(x_fp16) + + #------------------ Test Error Activation---------------------- def create_test_error_class(op_type): class TestOpErrors(unittest.TestCase): @@ -2972,6 +3052,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestHardSigmoid) create_test_act_fp16_class(TestSwish, grad_atol=0.85) create_test_act_fp16_class(TestHardSwish) +create_test_act_fp16_class(TestMish, grad_atol=0.9) def create_test_act_bf16_class(parent, diff --git a/python/paddle/fluid/tests/unittests/test_mish_op.py b/python/paddle/fluid/tests/unittests/test_mish_op.py deleted file mode 100644 index 8cc785e450f0b..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_mish_op.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -import six -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid import Program, program_guard -from op_test import OpTest, skip_check_grad_ci - - -class TestMishOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): - # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.mish, 0.1, 20) - # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.mish, x_int32, 20) - # support the input dtype is float32 - x_fp16 = fluid.layers.data( - name='x_fp16', shape=[12, 10], dtype='float32') - fluid.layers.mish(x_fp16, threshold=20) - - -class MishTest(OpTest): - def setUp(self): - self.init_dtype() - self.init_input_shape() - self.init_input_range() - self.init_threshold() - self.op_type = "mish" - - x_np = np.random.uniform(self.x_range[0], self.x_range[1], - self.x_shape).astype(self.dtype) - self.inputs = {'X': x_np} - - softplus = x_np * (x_np > self.threshold) + np.exp(x_np) * \ - (x_np < -self.threshold) + np.log(np.exp(x_np) + 1.) * \ - (x_np >= -self.threshold) * (x_np <= self.threshold) - out_np = x_np * np.tanh(softplus) - - self.outputs = {'Out': out_np} - self.attrs = {'threshold': self.threshold} - - def init_dtype(self): - self.dtype = 'float32' - - def init_input_shape(self): - self.x_shape = (10, 12) - - def init_input_range(self): - self.x_range = [-1, 1] - - def init_threshold(self): - self.threshold = 5. 
- - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class MishTestUpperThresh(MishTest): - def init_input_range(self): - self.x_range = [6, 7] - - -class MishTestLowerThresh(MishTest): - def init_input_range(self): - self.x_range = [-7, -6] - - -# mish op contain calculation like: tanh, exp, log, while tanh -# may have diff on CPUPlace(see test_activation_op.py::TestTanh), -# especially when abs(x) is a large value, only check input value -# in range [-1, 1] for float64 here. -class MishTestFP64(MishTest): - def init_dtype(self): - self.dtype = 'float64' - - def init_input_range(self): - self.x_range = [-1, 1] - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 37df0d4446767..78281faa85180 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -46,6 +46,7 @@ from .layer.activation import Softshrink # noqa: F401 from .layer.activation import Softsign # noqa: F401 from .layer.activation import Swish # noqa: F401 +from .layer.activation import Mish # noqa: F401 from .layer.activation import Tanhshrink # noqa: F401 from .layer.activation import ThresholdedReLU # noqa: F401 from .layer.activation import LogSoftmax # noqa: F401 @@ -294,6 +295,7 @@ def weight_norm(*args): 'LogSoftmax', 'Sigmoid', 'Swish', + 'Mish', 'PixelShuffle', 'ELU', 'ReLU6', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 676d7259f2843..c0b948b7a7826 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -39,6 +39,7 @@ from .activation import softshrink # noqa: F401 from .activation import softsign # noqa: F401 from .activation import swish # noqa: F401 +from .activation import mish # noqa: F401 from .activation import tanh # noqa: F401 from .activation import tanh_ # noqa: F401 from .activation import tanhshrink # noqa: F401 @@ -149,6 +150,7 @@ 'sigmoid', 'silu', 'swish', + 'mish', 'tanh', 'tanh_', 'tanhshrink', diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 4a071c2fe74f1..07acf142a5110 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1174,6 +1174,47 @@ def swish(x, name=None): return out +def mish(x, name=None): + r""" + mish activation. + + .. math:: + + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + mish(x) = x * \tanh(softplus(x)) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + x = paddle.to_tensor(np.array([-5., 0., 5.])) + out = F.mish(x) # [-0.03357624, 0., 4.99955208] + """ + if in_dygraph_mode(): + return _C_ops.mish(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mish') + helper = LayerHelper('mish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='mish', inputs={'X': x}, outputs={'Out': out}) + return out + + def tanhshrink(x, name=None): """ tanhshrink activation diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 45308f15f4a3b..34755f68b867a 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -881,6 +881,52 @@ def extra_repr(self): return name_str +class Mish(Layer): + r""" + Mish Activation. + + .. math:: + + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + Mish(x) = x * \tanh(softplus(x)) + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + x = paddle.to_tensor(np.array([-5., 0., 5.])) + m = paddle.nn.Mish() + out = m(x) # [-0.03357624, 0., 4.99955208] + + """ + + def __init__(self, name=None): + super(Mish, self).__init__() + self._name = name + + def forward(self, x): + return F.mish(x, self._name) + + def extra_repr(self): + name_str = 'name={}'.format(self._name) if self._name else '' + return name_str + + class Tanhshrink(Layer): """ Tanhshrink Activation From 5cf0bb794da4230c3d24138ae4067690db96d16b Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 7 Jan 2022 19:39:22 +0800 Subject: [PATCH 042/151] [PTen]Refactor flatten_grad kernel (#38712) * refactor flatten grad kernel * fix bugs when run ci unittest * fix bugs when use default GetExpectedPtenKernelArgs * xshape sometimes is has null holder ,fix this bugs --- paddle/fluid/framework/pten_utils.cc | 3 +- paddle/fluid/operators/flatten_op.cc | 6 ++ paddle/fluid/operators/flatten_op.h | 26 +++++--- paddle/pten/core/kernel_alias_name.h | 2 + paddle/pten/kernels/flatten_grad_kernel.cc | 73 ++++++++++++++++++++++ paddle/pten/kernels/flatten_grad_kernel.h | 27 ++++++++ paddle/pten/kernels/flatten_kernel.cc | 4 -- 7 files changed, 129 insertions(+), 12 deletions(-) create mode 100644 paddle/pten/kernels/flatten_grad_kernel.cc create mode 100644 paddle/pten/kernels/flatten_grad_kernel.h diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index cbd58592ef561..b8aedcce3e3fa 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -98,7 +98,8 @@ KernelSignatureMap& KernelSignatureMap::Instance() { for (const auto& pair : OpInfoMap::Instance().map()) { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + op_proto != nullptr) { KernelArgsNameMakerByOpProto maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; auto success = kernel_signature_map_->map_ diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc 
index a1b8dd6bae494..6b1ee00b55d62 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -431,6 +431,12 @@ class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")), ctx.device_context()); } + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("flatten_grad", + {framework::GradVarName("Out"), "XShape"}, + {}, {framework::GradVarName("X")}); + } }; DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index fa116d9516ecd..ef42619bfe4ff 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/include/core.h" +#include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" namespace paddle { @@ -146,15 +148,25 @@ class FlattenContiguousRangeGradKernel : public framework::OpKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto *xshape = ctx.Input("XShape"); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, ctx.GetPlace(), - ctx.template device_context(), d_x); - d_x->Resize(x_dims); + auto &dev_ctx = ctx.device_context(); + + auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePtenDenseTensor(*d_out); + + // Because the holder of xshape may be nullptr, we can't use + // MakePtenDenseTensor. + // So, we create a new DenseTensor to save the dims of xshape. + pten::DenseTensorMeta xshape_meta{pten::TransToPtenDataType(d_x->type()), + xshape->dims(), d_x->layout()}; + auto pt_xshape = + pten::Empty(dev_ctx, std::move(xshape_meta)); + + // call new kernel + pten::FlattenGradKernel(dev_ctx, *pt_d_out.get(), + pt_xshape, pt_d_x.get()); } }; diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 3b8347dec772e..56f7eea7ea802 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -27,12 +27,14 @@ const std::unordered_map kernel_alias_name_map = { {"fill_any_like", "full_like"}, {"fill_constant", "full"}, {"flatten_contiguous_range", "flatten"}, + {"flatten_contiguous_range_grad", "flatten_grad"}, {"matmul_v2", "matmul"}, {"reduce_mean", "mean"}, {"reduce_sum", "sum"}, {"reshape2", "reshape"}, // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated {"flatten", "deprecated"}, + {"flatten_grad", "deprecated"}, {"matmul", "deprecated"}, {"mean", "deprecated"}, {"reshape", "deprecated"}, diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc new file mode 100644 index 0000000000000..d6aea31748d6c --- /dev/null +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/flatten_grad_kernel.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" + +namespace pten { + +template +void FlattenGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& xshape, + DenseTensor* x_grad) { + auto xshape_dims = xshape.dims(); + auto x_dims = + paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + pten::Copy(dev_ctx, out_grad, false, x_grad); + x_grad->Resize(x_dims); +} + +} // namespace pten + +PT_REGISTER_CTX_KERNEL(flatten_grad, + CPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_CTX_KERNEL(flatten_grad, + GPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +#endif + +#ifdef PADDLE_WITH_XPU +PT_REGISTER_CTX_KERNEL(flatten_grad, + XPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} + +#endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.h b/paddle/pten/kernels/flatten_grad_kernel.h new file mode 100644 index 0000000000000..91d9aa7c30609 --- /dev/null +++ b/paddle/pten/kernels/flatten_grad_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
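The new FlattenGradKernel above reduces to "copy the output gradient and give it back the original shape": XShape contributes only its dims (its holder may be null), and the leading placeholder dimension is dropped via slice_ddim. A minimal NumPy sketch of that behaviour (illustrative names, not part of the patch):

import numpy as np

def flatten_grad_ref(out_grad, xshape_dims):
    # slice_ddim(xshape_dims, 1, xshape_dims.size()): drop the leading placeholder dim
    x_dims = tuple(xshape_dims[1:])
    return out_grad.copy().reshape(x_dims)

out_grad = np.arange(24.0).reshape(2, 12)   # gradient w.r.t. the flattened output
xshape_dims = (0, 2, 3, 4)                  # XShape dims: placeholder + original x dims
print(flatten_grad_ref(out_grad, xshape_dims).shape)   # (2, 3, 4)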
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void FlattenGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& xshape, + DenseTensor* x_grad); + +} // namespace pten diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index 37d4d88ccb40e..b284d3690830f 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -103,8 +103,6 @@ PT_REGISTER_CTX_KERNEL(flatten, pten::FlattenKernel, float, paddle::platform::float16, - double, - uint8_t, int8_t, int, int64_t) {} @@ -115,8 +113,6 @@ PT_REGISTER_CTX_KERNEL(flatten_with_xshape, pten::FlattenWithXShape, float, paddle::platform::float16, - double, - uint8_t, int8_t, int, int64_t) {} From 04f73d8901b78cfb9e29d80782bd86f74fec130d Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Mon, 10 Jan 2022 09:46:57 +0800 Subject: [PATCH 043/151] fix cuda seed bug of class_center_sample traning on multi gpu (#38815) --- paddle/fluid/operators/class_center_sample_op.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index f1ccbc913d9b1..fad74b81e14e4 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -397,7 +397,9 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - auto gen_cuda = framework::GetDefaultCUDAGenerator(rank); + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && (!fix_seed)) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; From 2238a535efb2497ab9161f07233b7860bcef1441 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Mon, 10 Jan 2022 10:11:52 +0800 Subject: [PATCH 044/151] remove fp32 tmp tensor and cast op for initializer.Normal and initializer.Constant (#38818) --- paddle/fluid/operators/gaussian_random_op.cu | 17 +++- python/paddle/fluid/initializer.py | 99 ++++--------------- .../fluid/tests/unittests/test_initializer.py | 18 ++-- .../tests/unittests/test_initializer_nn.py | 8 +- 4 files changed, 42 insertions(+), 100 deletions(-) diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 2ea432db6c7f0..ef0e000b25efd 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { @@ -38,10 +39,12 @@ struct GaussianGenerator { __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); - thrust::normal_distribution dist(mean_, std_); + using MT = typename details::MPTypeTrait::Type; + thrust::normal_distribution dist(mean_, std_); unsigned int new_n = n + offset_; rng.discard(new_n); - return dist(rng); + MT out = dist(rng); + return static_cast(out); } }; @@ -124,10 +127,14 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL( + gaussian_random, + paddle::operators::GPUGaussianRandomKernel, + paddle::operators::GPUGaussianRandomKernel, + paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, + paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< + paddle::platform::float16>, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel); diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index fd1562d609a1d..6ef3646a91943 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -137,54 +137,27 @@ def __call__(self, var, block=None): isinstance(var, framework.EagerParamBase)) assert isinstance(block, framework.Block) - # to be compatible of fp16 initializers - if var.dtype == VarDesc.VarType.FP16: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate(".".join( - ['constant_init', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) - else: - out_dtype = var.dtype - out_var = var - if framework.in_dygraph_mode(): - out_var = _C_ops.fill_constant( - out_var, 'value', + var = _C_ops.fill_constant( + var, 'value', float(self._value), 'force_cpu', self._force_cpu, 'dtype', - int(out_dtype), 'str_value', + int(var.dtype), 'str_value', str(float(self._value)), 'shape', var.shape) - if var.dtype == VarDesc.VarType.FP16: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) - var.copy_(var_tmp, False) - else: - var.copy_(out_var, False) return None else: # fill constant should set the "str_value" to preserve precision op = block.append_op( type="fill_constant", - outputs={"Out": out_var}, + outputs={"Out": var}, attrs={ "shape": var.shape, - "dtype": int(out_dtype), + "dtype": int(var.dtype), "value": float(self._value), 'str_value': str(float(self._value)), 'force_cpu': self._force_cpu }, stop_gradient=True) - if var.dtype == VarDesc.VarType.FP16: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) var.op = op return op @@ -361,54 +334,24 @@ def __call__(self, var, block=None): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate(".".join( - 
['gaussian_random', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) - else: - out_dtype = var.dtype - out_var = var - - if framework.in_dygraph_mode(): - out_var = _C_ops.gaussian_random( - 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, - 'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) - var.copy_(var_tmp, False) - else: - var.copy_(out_var, False) - return None - else: - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed, - "use_mkldnn": False - }, - stop_gradient=True) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + op = block.append_op( + type="gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": var.dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + "use_mkldnn": False + }, + stop_gradient=True) + + if not framework.in_dygraph_mode(): var.op = op return op + else: + return None class TruncatedNormalInitializer(Initializer): diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 6fdad811ee885..bff10c9c4ca26 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -65,7 +65,7 @@ def test_constant_initializer_default_value(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.ConstantInitializer()) - num_ops = 2 if dtype == "float16" else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -84,7 +84,7 @@ def test_constant_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.ConstantInitializer(2.3)) - num_ops = 2 if dtype == "float16" else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -94,10 +94,8 @@ def test_constant_initializer(self, dtype="float32"): def test_constant_initializer_fp16(self): """Test constant initializer with float16 """ - block = self.test_constant_initializer_default_value("float16") - self.assertTrue(check_cast_op(block.ops[1])) - block = self.test_constant_initializer("float16") - self.assertTrue(check_cast_op(block.ops[1])) + self.test_constant_initializer_default_value("float16") + self.test_constant_initializer("float16") def test_constant_initializer_bf16(self): """Test constant initializer with bfloat16 @@ -246,7 +244,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -258,14 +256,12 @@ def test_normal_initializer(self, dtype="float32"): def test_normal_initializer_fp16(self): """Test normal initializer with float16 """ - block = self.test_normal_initializer("float16") - self.assertTrue(check_cast_op(block.ops[1])) + 
self.test_normal_initializer("float16") def test_normal_initializer_bf16(self): """Test normal initializer with bfloat16 """ - block = self.test_normal_initializer("uint16") - self.assertTrue(check_cast_op(block.ops[1])) + self.test_normal_initializer("uint16") class TestXavierInitializer(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 85815c5eeef30..74686652044ec 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -54,7 +54,7 @@ def static_test_constant_initializer_common(self, lod_level=0, name="param", initializer=init_inst) - num_ops = 2 if dtype in ["float16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -103,9 +103,7 @@ def test_constant_initializer_fp16(self): """Test constant initializer with float16 """ block = self.test_constant_initializer_default_value_static("float16") - self.assertTrue(check_cast_op(block.ops[1])) block = self.test_constant_initializer_static("float16") - self.assertTrue(check_cast_op(block.ops[1])) self.test_constant_initializer_default_value_dygraph("float16") self.test_constant_initializer_dygraph("float16") @@ -402,7 +400,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -417,13 +415,11 @@ def test_normal_initializer_fp16(self): """Test normal initializer with float16 """ block = self.test_normal_initializer("float16") - self.assertTrue(check_cast_op(block.ops[1])) def test_normal_initializer_bf16(self): """Test normal initializer with bfloat16 """ block = self.test_normal_initializer("uint16") #bfloat16 - self.assertTrue(check_cast_op(block.ops[1])) def test_normal_initializer_dygraph(self): """Test normal initializer in dygraph model. 
From 7e31542c8c96a6f0ef9a2c76e9d18d66f933dd7e Mon Sep 17 00:00:00 2001 From: andyjpaddle <87074272+andyjpaddle@users.noreply.github.com> Date: Mon, 10 Jan 2022 10:30:53 +0800 Subject: [PATCH 045/151] Add MaxUnPool3D op and MaxUnPool1D op (#38716) * add maxunpool3d op * update doc for maxunpool3d op * update doc for maxunpool3d op * update doc for maxunpool3d op * update sample code for maxunpool3d * add maxunpool1d op * update some code for maxunpool1d --- paddle/fluid/operators/math/unpooling.cc | 93 +++++- paddle/fluid/operators/math/unpooling.cu | 113 ++++++- paddle/fluid/operators/math/unpooling.h | 18 +- paddle/fluid/operators/unpool_op.cc | 158 +++++++++- paddle/fluid/operators/unpool_op.cu.cc | 9 +- paddle/fluid/operators/unpool_op.h | 51 ++- .../fluid/tests/unittests/test_unpool1d_op.py | 156 ++++++++++ .../fluid/tests/unittests/test_unpool3d_op.py | 293 ++++++++++++++++++ python/paddle/nn/__init__.py | 4 + python/paddle/nn/functional/__init__.py | 4 + python/paddle/nn/functional/pooling.py | 221 +++++++++++++ python/paddle/nn/layer/__init__.py | 2 + python/paddle/nn/layer/pooling.py | 171 ++++++++++ 13 files changed, 1287 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_unpool1d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_unpool3d_op.py diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index bcb2b92780cc8..69fd2dbb85246 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -96,10 +96,101 @@ class Unpool2dMaxGradFunctor { } } }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. 
Please check input value.", + index, output_feasize, index, output_feasize)); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, output_feasize, index, output_feasize)); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; + template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index dbb3d64350cae..973865caba688 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -51,6 +51,45 @@ __global__ void KernelUnpool2dMaxGrad( /* * All tensors are in NCHW format. 
*/ + +template +__global__ void KernelUnpool3dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_depth, const int input_height, + const int input_width, const int channels, + T* output_data, const int output_depth, + const int output_height, + const int output_width) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_depth / input_width / input_height) % channels; + int n = linearIndex / input_depth / input_width / input_height / channels; + output_data += + (n * channels + c) * output_depth * output_height * output_width; + int maxind = indices_data[linearIndex]; + output_data[maxind] = input_data[linearIndex]; + } +} + +template +__global__ void KernelUnpool3dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_depth, const int input_height, const int input_width, + const int channels, const T* output_data, const T* output_grad, + const int output_depth, const int output_height, const int output_width, + T* input_grad) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_depth / input_width / input_height) % channels; + int n = linearIndex / input_depth / input_width / input_height / channels; + output_grad += + (n * channels + c) * output_depth * output_height * output_width; + int maxind = indices_data[linearIndex]; + input_grad[linearIndex] = output_grad[maxind]; + } +} +/* + * All tensors are in NCDHW format. + */ + template class Unpool2dMaxFunctor { public: @@ -112,10 +151,82 @@ class Unpool2dMaxGradFunctor { output_width, input_grad_data); } }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMax<<>>( + input.numel(), input_data, indices_data, input_depth, input_height, + input_width, output_channels, output_data, output_depth, output_height, + output_width); + } +}; +/* + * All tensors are in NCDHW format. 
+ */ +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_depth, input_height, + input_width, output_channels, output_data, output_grad_data, + output_depth, output_height, output_width, input_grad_data); + } +}; + template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h index 74ca39d114e26..63bd8186adeb2 100644 --- a/paddle/fluid/operators/math/unpooling.h +++ b/paddle/fluid/operators/math/unpooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,6 +33,22 @@ class Unpool2dMaxGradFunctor { const framework::Tensor& output_grad, framework::Tensor* input_grad); }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 108cd2722b5ed..8edfb4bc6c52f 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
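The scatter performed by the Unpool3dMax functors (and their 2-D counterparts) is simple to state in NumPy terms: within each (batch, channel) slice, every pooled value is written back to the flat position recorded during max-pooling, and all other output elements stay zero. A tiny sketch, illustration only, not part of the patch:

import numpy as np

pooled  = np.array([6.0, 8.0])   # values kept by a max-pool
indices = np.array([1, 3])       # their flat positions within the original slice
out = np.zeros(4)
out[indices] = pooled            # output_data[index] = input_data[i]
print(out)                       # [0. 6. 0. 8.]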
@@ -76,6 +76,65 @@ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf } }; +class Unpool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D, H and W is the depth, height and width of " + "feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool3d. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D, H and W is the depth, height and width of " + "feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(depth, height, width) " + "of unpooling operator."); + AddAttr>( + "strides", + "(vector, default:{1, 1, 1}), " + "strides (depth, height, width) of unpooling operator.") + .SetDefault({1, 1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0,0}), " + "paddings (depth, height, width) of unpooling operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddAttr>("output_size", + "(vector, optional). The shape of output.") + .SetDefault({0, 0, 0}); + AddAttr( + "data_format", + "(string, default NCDHW)" + "Defaults to \"NCDHW\". Specify the data format of the output data, ") + .SetDefault("NCDHW"); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, D_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, D_{out}, H_{out}, W_{out})$, where +$$ +D_{out} = (D_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +H_{out} = (H_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] \\ +W_{out} = (W_{in}-1) * strides[2] - 2 * paddings[2] + ksize[2] +$$ +)DOC"); + } +}; + int UnpoolOutputSize(int input_size, int ksize, int padding, int stride) { int output_size = (input_size - 1) * stride - 2 * padding + ksize; return output_size; @@ -130,6 +189,55 @@ class UnpoolOp : public framework::OperatorWithKernel { } }; +class Unpool3dOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3d"); + OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Unpool3d"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Unpool3d"); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector output_size = + ctx->Attrs().Get>("output_size"); + PADDLE_ENFORCE_EQ(in_x_dims.size() == 5, true, + platform::errors::InvalidArgument( + "Unpool Intput(X) must be of 5-dimensional, but " + "received Input(X)'s dimensions is %d.", + in_x_dims.size())); + 
PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims, + platform::errors::InvalidArgument( + "The dimensions of Input(X) must equal to be" + "the dimensions of Input(Indices), but received" + "dimensions of Input(X) is [%d], received dimensions" + "of Input(Indices) is [%d]", + in_x_dims, in_y_dims)); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { + output_shape.push_back(-1); + } else { + output_shape.push_back(output_size[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + template class UnpoolOpGradMaker : public framework::SingleGradOpMaker { public: @@ -145,6 +253,21 @@ class UnpoolOpGradMaker : public framework::SingleGradOpMaker { } }; +template +class Unpool3dOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Input("Indices")); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( @@ -163,6 +286,26 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class Unpool3dOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3dGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "Unpool3dGrad"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + } // namespace operators } // namespace paddle @@ -179,3 +322,16 @@ REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); + +REGISTER_OPERATOR(unpool3d, ops::Unpool3dOp, ops::Unpool3dOpMaker, + ops::Unpool3dOpGradMaker, + ops::Unpool3dOpGradMaker); + +REGISTER_OPERATOR(unpool3d_grad, ops::Unpool3dOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool3d, ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CPU_KERNEL( + unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc index 7c59a0feaa472..e3cab4426b4d8 100644 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,3 +22,10 @@ REGISTER_OP_CUDA_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool3d, ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CUDA_KERNEL( + unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index e388ec5ae3937..52849cb3e0f8e 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -69,5 +69,54 @@ class UnpoolGradKernel : public framework::OpKernel { unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; + +template +class Unpool3dKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (output_data) { + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + math::Unpool3dMaxFunctor unpool3d_max_forward; + unpool3d_max_forward(dev_ctx, *in_x, *in_y, out); + } +}; + +template +class Unpool3dGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + + math::Unpool3dMaxGradFunctor unpool3d_max_backward; + unpool3d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_unpool1d_op.py b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py new file mode 100644 index 0000000000000..95d19210acb72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +paddle.seed(2022) + + +def _unpool_output_size(x, kernel_size, stride, padding, output_size): + input_size = x.shape + default_size = [] + for d in range(len(kernel_size)): + default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d] + + kernel_size[d] - 2 * padding[d]) + if output_size is None: + ret = default_size + else: + ret = output_size + return ret + + +def unpool1dmax_forward_naive(input, indices, ksize, strides, paddings, + output_size): + s0, s1, s2 = input.shape + output_size = _unpool_output_size(input, ksize, strides, paddings, + output_size) + out_lsize = output_size[0] + out = np.zeros((s0, s1, out_lsize)) + for nidx in range(s0): + for cidx in range(s1): + for l in range(s2): + index = indices[nidx, cidx, l] + lidx = index % out_lsize + out[nidx, cidx, lidx] = input[nidx, cidx, l] + + return out + + +class TestUnpool1DOpAPI_dygraph(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool1d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=2) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_dygraph2(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool1d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=None) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_dygraph3(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + Pool1d = paddle.nn.MaxPool1D( + kernel_size=2, stride=2, return_mask=True) + UnPool1d = paddle.nn.MaxUnPool1D(kernel_size=2, stride=2) + + output, indices = Pool1d(input_x) + output_unpool = UnPool1d(output, indices) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + 
np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_static(unittest.TestCase): + def test_case(self): + paddle.enable_static() + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + input_data = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], + [9, 10, 11, 12]]]).astype("float32") + x = paddle.fluid.data( + name='x', shape=[1, 3, 4], dtype='float32') + output, indices = F.max_pool1d( + x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=None) + + exe = paddle.fluid.Executor(place) + fetches = exe.run(paddle.fluid.default_main_program(), + feed={"x": input_data}, + fetch_list=[output_unpool], + return_numpy=True) + pool1d_out_np = np.array( + [[[2., 4.], [6., 8.], [10., 12.]]]).astype("float32") + indices_np = np.array( + [[[1, 3], [1, 3], [1, 3]]]).astype("int32") + expected_output_unpool = unpool1dmax_forward_naive( + pool1d_out_np, indices_np, [2], [2], [0], [4]) + self.assertTrue(np.allclose(fetches[0], expected_output_unpool)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py new file mode 100644 index 0000000000000..e6031d9cee8b1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py @@ -0,0 +1,293 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
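The output sizes used throughout these tests follow the unpooling shape rule stated in the Unpool3dOpMaker comment and implemented by _unpool_output_size: out = (in - 1) * stride - 2 * padding + kernel. A short worked check (the helper name is illustrative, not part of the patch):

def unpool_out_dim(in_size, kernel, stride, padding=0):
    return (in_size - 1) * stride - 2 * padding + kernel

# 1-D tests above: length 16 pooled with kernel 2 / stride 2 gives 8, which unpools back to 16
assert unpool_out_dim(8, kernel=2, stride=2) == 16
# 3-D tests below: spatial dims (4, 4, 6) pool to (2, 2, 3) and unpool back to (4, 4, 6)
assert [unpool_out_dim(s, 2, 2) for s in (2, 2, 3)] == [4, 4, 6]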
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +paddle.seed(2022) + + +def _unpool_output_size(x, kernel_size, stride, padding, output_size): + input_size = x.shape + default_size = [] + for d in range(len(kernel_size)): + default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d] + + kernel_size[d] - 2 * padding[d]) + if output_size is None: + ret = default_size + else: + ret = output_size + return ret + + +def unpool3dmax_forward_naive(input, indices, ksize, strides, paddings, + output_size): + s0, s1, s2, s3, s4 = input.shape + output_size = _unpool_output_size(input, ksize, strides, paddings, + output_size) + out_dsize = output_size[0] + out_hsize = output_size[1] + out_wsize = output_size[2] + out = np.zeros((s0, s1, out_dsize, out_hsize, out_wsize)) + for nidx in range(s0): + for cidx in range(s1): + for d in range(s2): + for h in range(s3): + for w in range(s4): + index = indices[nidx, cidx, d, h, w] + didx = index // (out_wsize * out_hsize) + hidx = ( + index - didx * out_hsize * out_wsize) // out_wsize + widx = ( + index - didx * out_hsize * out_wsize) % out_wsize + out[nidx, cidx, didx, hidx, widx] = \ + input[nidx, cidx, d, h, w] + + return out + + +class TestUnpool3DOp(OpTest): + def setUp(self): + self.op_type = "unpool3d" + self.init_test_case() + inputs = np.random.randint(0, 100, self.shape) + nsize, csize, dsize, hsize, wsize = inputs.shape + self.output_size = _unpool_output_size(inputs, self.ksize, self.strides, + self.paddings, self.output_size) + indices = np.random.permutation( + np.arange(0, self.output_size[0] * self.output_size[1] * + self.output_size[2]))[:dsize * hsize * wsize] + indices = np.reshape(indices, [dsize, hsize, wsize]) + idx_list = [] + for n in range(nsize): + c_list = [] + for c in range(csize): + c_list.append(indices.tolist()) + idx_list.append(c_list) + indices = np.array(idx_list) + + output = self.unpool3d_forward_naive(inputs, indices, self.ksize, \ + self.strides, self.paddings, self.output_size).astype("float64") + + self.inputs = { + 'X': inputs.astype('float64'), + 'Indices': indices.astype('int32') + } + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.unpooling_type, + 'output_size': self.output_size, + } + self.outputs = {'Out': output.astype('float64')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 1, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = None + + +class TestUnpool3DOpcase1(TestUnpool3DOp): + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 3, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = None + + +class TestUnpool3DOpOutput(TestUnpool3DOp): + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 3, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = [7, 9, 11] + + +class TestUnpool3DOpException(unittest.TestCase): + def test_exception(self): + def 
indices_size_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 36), shape=[1, 1, 3, 3, 4]) + MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2) + + def indices_value_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(4, 40), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2) + + def data_format_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, indices, kernel_size=2, stride=2, data_format="NDHWC") + + def data_outputsize_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, + indices, + kernel_size=2, + stride=2, + output_size=[2, 2, 3, 4, 5]) + + def data_outputsize_error2(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, + indices, + kernel_size=2, + stride=2, + output_size=[10, 10, 10]) + + self.assertRaises(ValueError, indices_size_error) + self.assertRaises(ValueError, indices_value_error) + self.assertRaises(ValueError, data_format_error) + self.assertRaises(ValueError, data_outputsize_error) + self.assertRaises(ValueError, data_outputsize_error2) + + +class TestUnpool3DOpAPI_dygraph(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool3d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=2) + expected_output_unpool = unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_dygraph2(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool3d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=None) + expected_output_unpool = unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_dygraph3(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) + Pool3d = paddle.nn.MaxPool3D( + kernel_size=2, stride=2, return_mask=True) + UnPool3d = paddle.nn.MaxUnPool3D(kernel_size=2, stride=2) + + output, indices = Pool3d(input_x) + output_unpool = UnPool3d(output, indices) + expected_output_unpool = 
unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_static(unittest.TestCase): + def test_case(self): + paddle.enable_static() + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + input_data = np.array([[[[[1, 2, 3, 4], [5, 6, 7, 8], \ + [9, 10, 11, 12], [13, 14, 15, 16]], [[1, 2, 3, 4], [5, 6, 7, 8], \ + [9, 10, 11, 12], [13, 14, 15, 16]]]]]).astype("float32") + x = paddle.fluid.data( + name='x', shape=[1, 1, 2, 4, 4], dtype='float32') + output, indices = F.max_pool3d( + x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=None) + + exe = paddle.fluid.Executor(place) + fetches = exe.run(paddle.fluid.default_main_program(), + feed={"x": input_data}, + fetch_list=[output_unpool], + return_numpy=True) + pool3d_out_np = np.array( + [[[[[6., 8.], [14., 16.]]]]]).astype("float32") + indices_np = np.array([[[[[5, 7], [13, 15]]]]]).astype("int32") + expected_output_unpool = unpool3dmax_forward_naive( + pool3d_out_np, indices_np, [2, 2, 2], [2, 2, 2], [0, 0, 0], + [2, 4, 4]) + self.assertTrue(np.allclose(fetches[0], expected_output_unpool)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 78281faa85180..ad8f28f40bb58 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -77,7 +77,9 @@ from .layer.pooling import MaxPool1D # noqa: F401 from .layer.pooling import MaxPool2D # noqa: F401 from .layer.pooling import MaxPool3D # noqa: F401 +from .layer.pooling import MaxUnPool1D # noqa: F401 from .layer.pooling import MaxUnPool2D # noqa: F401 +from .layer.pooling import MaxUnPool3D # noqa: F401 from .layer.pooling import AdaptiveAvgPool1D # noqa: F401 from .layer.pooling import AdaptiveAvgPool2D # noqa: F401 from .layer.pooling import AdaptiveAvgPool3D # noqa: F401 @@ -301,6 +303,8 @@ def weight_norm(*args): 'ReLU6', 'LayerDict', 'ZeroPad2D', + 'MaxUnPool1D', 'MaxUnPool2D', + 'MaxUnPool3D', 'HingeEmbeddingLoss', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index c0b948b7a7826..a24afc45a5995 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -107,7 +107,9 @@ from .pooling import adaptive_avg_pool1d # noqa: F401 from .pooling import adaptive_avg_pool2d # noqa: F401 from .pooling import adaptive_avg_pool3d # noqa: F401 +from .pooling import max_unpool1d # noqa: F401 from .pooling import max_unpool2d # noqa: F401 +from .pooling import max_unpool3d # noqa: F401 from .vision import affine_grid # noqa: F401 from .vision import grid_sample # noqa: F401 @@ -179,7 +181,9 @@ 'max_pool1d', 'max_pool2d', 'max_pool3d', + 'max_unpool1d', 'max_unpool2d', + 'max_unpool3d', 'adaptive_avg_pool1d', 'adaptive_avg_pool2d', 'adaptive_avg_pool3d', diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 27f4d4a7db345..db9665f7a32c4 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -664,6 +664,115 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size): return ret +def max_unpool1d(x, + 
indices,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 data_format="NCL",
+                 output_size=None,
+                 name=None):
+    """
+    This API implements the max unpooling 1d operation.
+    `max_unpool1d` accepts the output of `max_pool1d` as input,
+    including the indices of the maximum values, and calculates the partial inverse.
+    All non-maximum values are set to zero.
+
+    - Input: :math:`(N, C, L_{in})`
+    - Output: :math:`(N, C, L_{out})`, where
+
+    .. math::
+        L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size
+
+    or as given by :attr:`output_size` in the call operator.
+
+
+    Args:
+        x (Tensor): The input tensor of the unpooling operator which is a 3-D tensor with
+                          shape [N, C, L]. The format of the input tensor is `"NCL"`,
+                          where `N` is the batch size, `C` is the number of channels, and `L` is
+                          the length of the feature. The data type is float32 or float64.
+        indices (Tensor): The indices given out by max_pool1d which is a 3-D tensor with
+                          shape [N, C, L]. The format of the input tensor is `"NCL"`,
+                          where `N` is the batch size, `C` is the number of channels, and `L` is
+                          the length of the feature. The data type is float32 or float64.
+        kernel_size (int|list|tuple): The unpool kernel size. If the unpool kernel size is a tuple or list,
+            it must contain an integer.
+        stride (int|list|tuple): The unpool stride size. If the unpool stride size is a tuple or list,
+            it must contain an integer.
+        padding (int | tuple): Padding that was added to the input.
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
+                           the actual output shape will be automatically calculated from input_shape,
+                           kernel_size, stride and padding.
+        data_format (string): The data format of the input and output data.
+                        The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_length]`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name does not need to be set and
+                             is None by default.
+
+    Returns:
+        Tensor: The output tensor of the unpooling result.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            data = paddle.rand(shape=[1, 3, 16])
+            pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True)
+            # pool_out shape: [1, 3, 8],  indices shape: [1, 3, 8]
+            unpool_out = F.max_unpool1d(pool_out, indices, kernel_size=2, padding=0)
+            # unpool_out shape: [1, 3, 16]
+
+    """
+    # NCL to NCHW
+    if data_format not in ["NCL"]:
+        raise ValueError("Attr(data_format) should be 'NCL'. Received "
+                         "Attr(data_format): %s." % str(data_format))
+    data_format = "NCHW"
+    x = unsqueeze(x, [2])
+    indices = unsqueeze(indices, [2])
+    kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
+    padding, padding_algorithm = _update_padding_nd(padding, 1)
+    # use the 2d op to implement 1d; padding has to be expanded in advance.
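+    # NOTE (explanatory sketch, not part of the public API contract): the 1-D
+    # tensors were viewed above as 4-D NCHW tensors with a dummy H == 1 axis
+    # (unsqueeze at axis 2), and kernel_size/stride were prepended with 1, so
+    # the existing 2-D `unpool` kernel can be reused. Here the padding list is
+    # expanded to cover that dummy axis as well, presumably with a zero entry
+    # for the dummy dimension.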
+ padding = _expand_low_nd_padding(padding) + + output_size = _unpool_output_size(x, kernel_size, stride, padding, + output_size) + + if in_dygraph_mode(): + output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', + kernel_size, 'strides', stride, 'paddings', + padding, "output_size", output_size, + "data_format", data_format) + return squeeze(output, [2]) + + op_type = "unpool" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name="x") + unpool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x, + "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size + }) + return squeeze(unpool_out, [2]) + + def max_unpool2d(x, indices, kernel_size, @@ -779,6 +888,118 @@ def max_unpool2d(x, return unpool_out +def max_unpool3d(x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None): + """ + This API implements max unpooling 3d opereation. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. + All non-maximum values ​​are set to zero. + + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + + .. math:: + H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + + .. math:: + W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + + or as given by :attr:`output_size` in the call operator + + + Args: + x (Tensor): The input tensor of unpooling operator which is a 5-D tensor with + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, + where `N` is batch size, `C` is the number of channels, `D` is + the depth of the feature, `H` is the height of the feature, + and `W` is the width of the feature. The data type is float32 or float64. + indices (Tensor): The indices given out by maxpooling3d which is a 5-D tensor with + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , + where `N` is batch size, `C` is the number of channels, `D` is + the depth of the feature, `H` is the height of the feature, + and `W` is the width of the feature. The data type is float32 or float64. + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain an integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of unpooling result. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + data = paddle.rand(shape=[1, 1, 4, 4, 6]) + pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] + unpool_out = F.max_unpool3d(pool_out, indices, kernel_size=2, padding=0) + # unpool_out shape: [1, 1, 4, 4, 6] + + """ + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') + padding = utils.convert_to_list(padding, 3, 'padding') + + if data_format not in ["NCDHW"]: + raise ValueError("Attr(data_format) should be 'NCDHW'. Received " + "Attr(data_format): %s." % str(data_format)) + + output_size = _unpool_output_size(x, kernel_size, stride, padding, + output_size) + + if in_dygraph_mode(): + output = _C_ops.unpool3d(x, indices, 'unpooling_type', 'max', 'ksize', + kernel_size, 'strides', stride, 'paddings', + padding, "output_size", output_size, + "data_format", data_format) + return output + + op_type = "unpool3d" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name="x") + unpool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x, + "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size + }) + return unpool_out + + def max_pool2d(x, kernel_size, stride=None, diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index f536c3d5ff379..2b50508065605 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -57,7 +57,9 @@ from .pooling import AdaptiveMaxPool1D # noqa: F401 from .pooling import AdaptiveMaxPool2D # noqa: F401 from .pooling import AdaptiveMaxPool3D # noqa: F401 +from .pooling import MaxUnPool1D # noqa: F401 from .pooling import MaxUnPool2D # noqa: F401 +from .pooling import MaxUnPool3D # noqa: F401 from .conv import Conv1D # noqa: F401 from .conv import Conv2D # noqa: F401 from .conv import Conv3D # noqa: F401 diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index cc49db9b2056f..96942f5c8500a 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1130,6 +1130,88 @@ def extra_repr(self): self._return_mask) +class MaxUnPool1D(Layer): + """ + This API implements max unpooling 1d opereation. + + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. + All non-maximum values ​​are set to zero. + + - Input: :math:`(N, C, L_{in})` + - Output: :math:`(N, C, L_{out})`, where + + .. math:: + L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size + + or as given by :attr:`output_size` in the call operator. + + Parameters: + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain an integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). 
+ data_format (string): The data format of the input and output data. + The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: + `[batch_size, input_channels, input_length]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + + Returns: + A callable object of MaxUnPool1D. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + data = paddle.rand(shape=[1, 3, 16]) + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 3, 8], indices shape: [1, 3, 8] + Unpool1D = paddle.nn.MaxUnPool1D(kernel_size=2, padding=0) + unpool_out = Unpool1D(pool_out, indices) + # unpool_out shape: [1, 3, 16] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + data_format="NCL", + output_size=None, + name=None): + super(MaxUnPool1D, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.data_format = data_format + self.output_size = output_size + self.name = name + + def forward(self, x, indices): + return F.max_unpool1d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name) + + def extra_repr(self): + return 'output_size={}'.format(self.output_size) + + class MaxUnPool2D(Layer): """ This API implements max unpooling 2d opereation. @@ -1214,3 +1296,92 @@ def forward(self, x, indices): def extra_repr(self): return 'output_size={}'.format(self.output_size) + + +class MaxUnPool3D(Layer): + """ + This API implements max unpooling 3d opereation. + + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. + All non-maximum values ​​are set to zero. + + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + + .. math:: + H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + + .. math:: + W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + + or as given by :attr:`output_size` in the call operator + + + Parameters: + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain an integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + + Returns: + A callable object of MaxUnPool3D. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + data = paddle.rand(shape=[1, 1, 4, 4, 6]) + pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] + Unpool3D = paddle.nn.MaxUnPool3D(kernel_size=2, padding=0) + unpool_out = Unpool3D(pool_out, indices) + # unpool_out shape: [1, 1, 4, 4, 6] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None): + super(MaxUnPool3D, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.data_format = data_format + self.output_size = output_size + self.name = name + + def forward(self, x, indices): + return F.max_unpool3d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name) + + def extra_repr(self): + return 'output_size={}'.format(self.output_size) From 492e6dd0f7e0e8f33cc55a25c0ec41dd133f8152 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 10 Jan 2022 10:33:55 +0800 Subject: [PATCH 046/151] modify comment of mish (#38805) --- python/paddle/nn/functional/activation.py | 3 +-- python/paddle/nn/layer/activation.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 07acf142a5110..ac08ac9391eb3 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1200,9 +1200,8 @@ def mish(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-5., 0., 5.])) + x = paddle.to_tensor([-5., 0., 5.]) out = F.mish(x) # [-0.03357624, 0., 4.99955208] """ if in_dygraph_mode(): diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 34755f68b867a..617981cb8f74c 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -907,9 +907,8 @@ class Mish(Layer): .. 
code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-5., 0., 5.])) + x = paddle.to_tensor([-5., 0., 5.]) m = paddle.nn.Mish() out = m(x) # [-0.03357624, 0., 4.99955208] From cd2855b0626a4cce979ce58587c942b0b0304691 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 10 Jan 2022 10:54:24 +0800 Subject: [PATCH 047/151] [fleet_executor] Add barrier rpc (#38799) --- .../distributed/fleet_executor/CMakeLists.txt | 6 ++-- .../distributed/fleet_executor/carrier.cc | 1 - .../fleet_executor/fleet_executor.cc | 1 - .../fleet_executor/interceptor_message.proto | 5 +-- .../distributed/fleet_executor/message_bus.cc | 32 ++++++++----------- .../distributed/fleet_executor/message_bus.h | 4 +-- ..._message_service.cc => message_service.cc} | 17 ++++++++-- ...or_message_service.h => message_service.h} | 12 ++++--- 8 files changed, 44 insertions(+), 34 deletions(-) rename paddle/fluid/distributed/fleet_executor/{interceptor_message_service.cc => message_service.cc} (68%) rename paddle/fluid/distributed/fleet_executor/{interceptor_message_service.h => message_service.h} (75%) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index e9da55c417e9a..d8372e10888d9 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -13,7 +13,7 @@ endif() cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc - interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc + interceptor.cc compute_interceptor.cc amplifier_interceptor.cc message_service.cc message_bus.cc DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) @@ -29,8 +29,8 @@ if(WITH_DISTRIBUTE) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(interceptor_message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(interceptor_message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) add_subdirectory(test) endif() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 79be1824b864d..79ca6f467a38d 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include 
"paddle/fluid/distributed/fleet_executor/task_node.h" diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index e22d0945a2398..d6c1e678ad4f7 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -137,7 +137,6 @@ void FleetExecutor::Run(const std::string& carrier_id) { // Set current running carrier if (*GlobalVal::Get() != carrier_id) { GlobalVal::Set(new std::string(carrier_id)); - // TODO(liyurui): Move barrier to service GlobalVal::Get()->Barrier(); } carrier->Start(); diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto index c9ab477183a31..ed38894641c3a 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto +++ b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto @@ -34,7 +34,8 @@ message InterceptorMessage { message InterceptorResponse { optional bool rst = 1 [ default = false ]; } -service TheInterceptorMessageService { - rpc InterceptorMessageService(InterceptorMessage) +service MessageService { + rpc ReceiveInterceptorMessage(InterceptorMessage) returns (InterceptorResponse); + rpc IncreaseBarrierCount(InterceptorMessage) returns (InterceptorResponse); } diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 110c5feafc71a..8d2ec5c41d864 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -163,18 +163,9 @@ void MessageBus::Barrier() { bool MessageBus::DispatchMsgToCarrier( const InterceptorMessage& interceptor_message) { - if (interceptor_message.ctrl_message()) { - VLOG(3) << "Receiving control message from rank " - << interceptor_message.src_id() << " to rank " - << interceptor_message.dst_id(); - // for barrier - IncreaseBarrierCount(); - return true; - } else { - const std::string& carrier_id = *GlobalVal::Get(); - return GlobalMap::Get(carrier_id) - ->EnqueueInterceptorMessage(interceptor_message); - } + const std::string& carrier_id = *GlobalVal::Get(); + return GlobalMap::Get(carrier_id) + ->EnqueueInterceptorMessage(interceptor_message); } void MessageBus::ListenPort() { @@ -185,10 +176,9 @@ void MessageBus::ListenPort() { #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) // function keep listen the port and handle the message - PADDLE_ENFORCE_EQ(server_.AddService(&interceptor_message_service_, - brpc::SERVER_DOESNT_OWN_SERVICE), - 0, platform::errors::Unavailable( - "Message bus: init brpc service error.")); + PADDLE_ENFORCE_EQ( + server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, + platform::errors::Unavailable("Message bus: init brpc service error.")); // start the server const char* ip_for_brpc = addr_.c_str(); @@ -229,11 +219,16 @@ bool MessageBus::SendInterRank(int64_t dst_rank, PADDLE_ENFORCE_EQ( channel.Init(dst_addr_for_brpc, &options), 0, platform::errors::Unavailable("Message bus: init brpc channel error.")); - TheInterceptorMessageService_Stub stub(&channel); + MessageService_Stub stub(&channel); InterceptorResponse response; brpc::Controller ctrl; ctrl.set_log_id(0); - stub.InterceptorMessageService(&ctrl, &interceptor_message, &response, NULL); + if (interceptor_message.ctrl_message()) { + stub.IncreaseBarrierCount(&ctrl, 
&interceptor_message, &response, NULL); + } else { + stub.ReceiveInterceptorMessage(&ctrl, &interceptor_message, &response, + NULL); + } if (!ctrl.Failed()) { if (response.rst()) { VLOG(3) << "Message bus: brpc sends success."; @@ -248,6 +243,7 @@ bool MessageBus::SendInterRank(int64_t dst_rank, return false; } } + #endif } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index 456cd77e2dde8..d805ac81606b8 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -24,7 +24,7 @@ !defined(PADDLE_WITH_ASCEND_CL) #include "brpc/channel.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" +#include "paddle/fluid/distributed/fleet_executor/message_service.h" #endif #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" @@ -83,7 +83,7 @@ class MessageBus final { #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) - InterceptorMessageServiceImpl interceptor_message_service_; + MessageServiceImpl message_service_; // brpc server brpc::Server server_; #endif diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc similarity index 68% rename from paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc rename to paddle/fluid/distributed/fleet_executor/message_service.cc index ce8a73602d0be..c3fff98f684ad 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -13,7 +13,7 @@ // limitations under the License. 
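+// NOTE (summary sketch of this patch): after the rename, the brpc service
+// exposes two RPCs: ReceiveInterceptorMessage, which dispatches ordinary
+// interceptor messages to the current carrier, and IncreaseBarrierCount,
+// which only bumps the MessageBus barrier counter for ctrl messages (see
+// message_bus.cc), so barrier traffic no longer goes through the carrier
+// dispatch path.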
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" +#include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -21,18 +21,29 @@ namespace paddle { namespace distributed { -void InterceptorMessageServiceImpl::InterceptorMessageService( +void MessageServiceImpl::ReceiveInterceptorMessage( google::protobuf::RpcController* control_base, const InterceptorMessage* request, InterceptorResponse* response, google::protobuf::Closure* done) { brpc::ClosureGuard done_guard(done); - VLOG(3) << "Interceptor Message Service receives a message from interceptor " + VLOG(3) << "Message Service receives a message from interceptor " << request->src_id() << " to interceptor " << request->dst_id() << ", with the message: " << request->message_type(); bool flag = GlobalVal::Get()->DispatchMsgToCarrier(*request); response->set_rst(flag); } +void MessageServiceImpl::IncreaseBarrierCount( + google::protobuf::RpcController* control_base, + const InterceptorMessage* request, InterceptorResponse* response, + google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + VLOG(3) << "Barrier Service receives a message from rank " + << request->src_id() << " to rank " << request->dst_id(); + GlobalVal::Get()->IncreaseBarrierCount(); + response->set_rst(true); +} + } // namespace distributed } // namespace paddle #endif diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h similarity index 75% rename from paddle/fluid/distributed/fleet_executor/interceptor_message_service.h rename to paddle/fluid/distributed/fleet_executor/message_service.h index 0a8dfc861a910..02f73471e3b91 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -21,11 +21,15 @@ namespace paddle { namespace distributed { -class InterceptorMessageServiceImpl : public TheInterceptorMessageService { +class MessageServiceImpl : public MessageService { public: - InterceptorMessageServiceImpl() {} - virtual ~InterceptorMessageServiceImpl() {} - virtual void InterceptorMessageService( + MessageServiceImpl() {} + virtual ~MessageServiceImpl() {} + virtual void ReceiveInterceptorMessage( + google::protobuf::RpcController* control_base, + const InterceptorMessage* request, InterceptorResponse* response, + google::protobuf::Closure* done); + virtual void IncreaseBarrierCount( google::protobuf::RpcController* control_base, const InterceptorMessage* request, InterceptorResponse* response, google::protobuf::Closure* done); From 046553c71389bf715edcc6836792627dd1443caa Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 10 Jan 2022 11:00:48 +0800 Subject: [PATCH 048/151] Support setting infershape function for custom grad op (#38776) * unify infer_shape func calling * support set grad infer shape fn for custom op * unify infershape in new executor and eager * remove todo comment * revert infershape in operator --- .../fluid/eager/legacy/prepared_operator.cc | 3 +- paddle/fluid/framework/custom_operator.cc | 303 ++++++++++-------- .../framework/new_executor/data_transfer.cc | 3 +- .../new_executor/interpretercore_util.cc | 2 +- paddle/fluid/framework/operator.cc | 4 +- 
paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/pten/api/lib/op_meta_info.cc | 7 - .../fluid/tests/custom_op/custom_relu_op.cc | 46 +++ .../fluid/tests/custom_op/custom_relu_op.cu | 19 ++ .../custom_op/test_custom_relu_op_jit.py | 3 +- 10 files changed, 236 insertions(+), 160 deletions(-) diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index 1c3429207f8b5..4e892b14a9c9c 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -174,8 +174,7 @@ static void PreparedOpRunImpl( EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 785973e041a0d..fd2522b0336ff 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -94,7 +94,7 @@ std::vector ParseAttrStr(const std::string& attr) { // 2. type rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1))); - VLOG(1) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; + VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; return rlt; } @@ -109,11 +109,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& inputs, const std::vector& outputs, const std::vector& attrs) { - VLOG(1) << "Custom Operator: Start run KernelFunc."; + VLOG(3) << "Custom Operator: Start run KernelFunc."; std::vector custom_ins; std::vector> custom_vec_ins; for (auto& in_name : inputs) { - VLOG(1) << "Custom Operator: input name - " << in_name; + VLOG(3) << "Custom Operator: input name - " << in_name; if (detail::IsDuplicableVar(in_name)) { // return const std::vector auto vec_x = ctx.MultiInput(in_name); @@ -185,11 +185,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - VLOG(1) << "Custom Operator: Run ComputeFunc."; + VLOG(3) << "Custom Operator: Run ComputeFunc."; try { auto outs = func(custom_ins, custom_vec_ins, custom_attrs); - VLOG(1) << "Custom Operator: Share outputs into ExecutionContext."; + VLOG(3) << "Custom Operator: Share outputs into ExecutionContext."; for (size_t i = 0; i < outputs.size(); ++i) { auto out_name = outputs[i]; if (detail::IsDuplicableVar(out_name)) { @@ -230,6 +230,95 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } +static void RunInferShapeFunc(framework::InferShapeContext* ctx, + const paddle::InferShapeFunc& func, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& attrs) { + std::vector> input_shapes; + std::vector>> vec_input_shapes; + + VLOG(3) << "Custom Operator: InferShape - get input ddim."; + for (auto& in_name : inputs) { + if (detail::IsDuplicableVar(in_name)) { + OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); + auto vec_ddim = ctx->GetInputsDim(in_name); + std::vector> vec_shape; + vec_shape.reserve(vec_ddim.size()); + std::transform(vec_ddim.begin(), vec_ddim.end(), + std::back_inserter(vec_shape), + [&](const DDim& ddim) -> std::vector { + return framework::vectorize(ddim); + }); + vec_input_shapes.emplace_back(vec_shape); + } else { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + 
input_shapes.emplace_back(framework::vectorize(ddim)); + } + } + + std::vector custom_attrs; + for (auto& attr_str : attrs) { + auto attr_name_and_type = detail::ParseAttrStr(attr_str); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + if (attr_type_str == "bool") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "float") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int64_t") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::string") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back(ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + // NOTE(chenweihang): InferShape can't support std::vector + // attr type, because the input type is std::vector, only + // can use one rule to parse std::vector parameter + continue; + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. " + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "Please check whether the attribute data type and " + "data type string are matched.", + attr_type_str)); + } + } + + VLOG(3) << "Custom Operator: InferShape - calc output ddim."; + auto output_shapes = func(input_shapes, vec_input_shapes, custom_attrs); + + VLOG(3) << "Custom Operator: InferShape - set output ddim."; + for (size_t i = 0; i < outputs.size(); ++i) { + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + std::vector vec_ddim; + vec_ddim.reserve(output_shapes.size()); + std::transform(output_shapes.begin(), output_shapes.end(), + std::back_inserter(vec_ddim), + [&](const std::vector& shape) -> DDim { + return framework::make_ddim(shape); + }); + ctx->SetOutputsDim(out_name, vec_ddim); + } else { + ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); + } + } +} + //////////////////// Operator Define ///////////////// class CustomOperator : public OperatorWithKernel { @@ -239,7 +328,7 @@ class CustomOperator : public OperatorWithKernel { // Dummy infershape // Because it is a pure virtual function, it must be implemented void InferShape(framework::InferShapeContext* ctx) const override { - VLOG(1) << "Custom Operator: Dummy infer shape of custom operator."; + VLOG(3) << "Custom Operator: Dummy infer shape of custom operator."; } /** @@ -381,7 +470,7 @@ class CustomGradOpMaker : public SingleGradOpMaker { auto fwd_op_outputs = this->OutputNames(); for (auto& in_name : inputs_) { - VLOG(1) << "Custom Operator: GradOpDescMaker - input: " << in_name; + VLOG(3) << "Custom Operator: GradOpDescMaker - input: " << in_name; if (!detail::IsGradVar(in_name)) { if (detail::IsMemberOf(fwd_op_inputs, in_name)) { grad_op->SetInput(in_name, this->Input(in_name)); @@ -398,7 +487,7 @@ class CustomGradOpMaker : public SingleGradOpMaker { } } for (auto& out_name : outputs_) { - VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; + VLOG(3) << "Custom Operator: GradOpDescMaker - output: " << 
out_name; if (detail::IsDuplicableVar(out_name)) { grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name), @@ -447,7 +536,7 @@ class CustomGradOpMaker auto fwd_op_outputs = this->OutputNames(); for (auto& in_name : inputs_) { - VLOG(1) << "Custom Operator: GradOpBaseMaker - input: " << in_name; + VLOG(3) << "Custom Operator: GradOpBaseMaker - input: " << in_name; if (!detail::IsGradVar(in_name)) { if (detail::IsMemberOf(fwd_op_inputs, in_name)) { grad_op->SetInput(in_name, this->Input(in_name)); @@ -464,7 +553,7 @@ class CustomGradOpMaker } } for (auto& out_name : outputs_) { - VLOG(1) << "Custom Operator: GradOpBaseMaker - output: " << out_name; + VLOG(3) << "Custom Operator: GradOpBaseMaker - output: " << out_name; grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); } grad_op->SetAttrMap(this->Attrs()); @@ -486,11 +575,11 @@ void RegisterOperatorKernelWithPlace(const std::string& name, const std::vector& outputs, const std::vector& attrs) { OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place)); - VLOG(1) << "Custom Operator: op kernel key: " << key; + VLOG(3) << "Custom Operator: op kernel key: " << key; OperatorWithKernel::AllOpKernels()[name][key] = [kernel_func, inputs, outputs, attrs](const framework::ExecutionContext& ctx) { - VLOG(1) << "Custom Operator: run custom kernel func in lambda."; + VLOG(3) << "Custom Operator: run custom kernel func in lambda."; RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs); }; } @@ -500,7 +589,7 @@ void RegisterOperatorKernel(const std::string& name, const std::vector& inputs, const std::vector& outputs, const std::vector& attrs) { - VLOG(1) << "Custom Operator: op name in kernel: " << name; + VLOG(3) << "Custom Operator: op name in kernel: " << name; // NOTE [ Dummy Op Kernel Key ] // TODO(chenweihang): Because execute engine need get device context based // op_kernel_key.place_, so we should register kernel for each @@ -535,12 +624,12 @@ void RegisterOperatorWithMetaInfo( auto& infer_shape_func = OpMetaInfoHelper::GetInferShapeFn(base_op_meta); auto& infer_dtype_func = OpMetaInfoHelper::GetInferDtypeFn(base_op_meta); - VLOG(1) << "Custom Operator: forward, op name: " << op_name; - VLOG(1) << "Custom Operator: forward, op inputs: " + VLOG(3) << "Custom Operator: forward, op name: " << op_name; + VLOG(3) << "Custom Operator: forward, op inputs: " << string::join_strings(op_inputs, ','); - VLOG(1) << "Custom Operator: forward, op outputs: " + VLOG(3) << "Custom Operator: forward, op outputs: " << string::join_strings(op_outputs, ','); - VLOG(1) << "Custom Operator: forward, op attrs: " + VLOG(3) << "Custom Operator: forward, op attrs: " << string::join_strings(op_attrs, ','); // Op @@ -588,96 +677,13 @@ void RegisterOperatorWithMetaInfo( "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); - VLOG(1) << "Custom Operator: Default InferShape - share ddim."; + VLOG(3) << "Custom Operator: Default InferShape - share ddim."; ctx->ShareDim(op_inputs[0], op_outputs[0]); }; } else { info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](InferShapeContext* ctx) { - std::vector> input_shapes; - std::vector>> vec_input_shapes; - - VLOG(1) << "Custom Operator: InferShape - get input ddim."; - for (auto& in_name : op_inputs) { - if (detail::IsDuplicableVar(in_name)) { - OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); - auto vec_ddim = ctx->GetInputsDim(in_name); - std::vector> vec_shape; - vec_shape.reserve(vec_ddim.size()); 
- std::transform(vec_ddim.begin(), vec_ddim.end(), - std::back_inserter(vec_shape), - [&](const DDim& ddim) -> std::vector { - return framework::vectorize(ddim); - }); - vec_input_shapes.emplace_back(vec_shape); - } else { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); - } - } - - std::vector custom_attrs; - for (auto& attr_str : op_attrs) { - auto attr_name_and_type = detail::ParseAttrStr(attr_str); - auto attr_name = attr_name_and_type[0]; - auto attr_type_str = attr_name_and_type[1]; - if (attr_type_str == "bool") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "int") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "float") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "int64_t") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "std::string") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else if (attr_type_str == "std::vector") { - // NOTE(chenweihang): InferShape can't support std::vector - // attr type, because the input type is std::vector, only - // can use one rule to parse std::vector parameter - continue; - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported `%s` type value as custom attribute now. 
" - "Supported data types include `bool`, `int`, `float`, " - "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector`, " - "Please check whether the attribute data type and " - "data type string are matched.", - attr_type_str)); - } - } - - VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = - infer_shape_func(input_shapes, vec_input_shapes, custom_attrs); - - VLOG(1) << "Custom Operator: InferShape - set output ddim."; - for (size_t i = 0; i < op_outputs.size(); ++i) { - auto out_name = op_outputs[i]; - if (detail::IsDuplicableVar(out_name)) { - std::vector vec_ddim; - vec_ddim.reserve(output_shapes.size()); - std::transform(output_shapes.begin(), output_shapes.end(), - std::back_inserter(vec_ddim), - [&](const std::vector& shape) -> DDim { - return framework::make_ddim(shape); - }); - ctx->SetOutputsDim(out_name, vec_ddim); - } else { - ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); - } - } + RunInferShapeFunc(ctx, infer_shape_func, op_inputs, op_outputs, op_attrs); }; } @@ -706,7 +712,7 @@ void RegisterOperatorWithMetaInfo( "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); - VLOG(1) << "Custom Operator: InferDtype - share dtype."; + VLOG(3) << "Custom Operator: InferDtype - share dtype."; auto dtype = ctx->GetInputDataType(op_inputs[0]); ctx->SetOutputDataType(op_outputs[0], dtype); }; @@ -716,7 +722,7 @@ void RegisterOperatorWithMetaInfo( std::vector input_dtypes; std::vector> vec_input_dtypes; - VLOG(1) << "Custom Operator: InferDtype - get input dtype."; + VLOG(3) << "Custom Operator: InferDtype - get input dtype."; for (auto& in_name : op_inputs) { if (detail::IsDuplicableVar(in_name)) { std::vector vec_custom_dtype; @@ -731,10 +737,10 @@ void RegisterOperatorWithMetaInfo( } } - VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; + VLOG(3) << "Custom Operator: InferDtype - infer output dtype."; auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes); - VLOG(1) << "Custom Operator: InferDtype - set output dtype."; + VLOG(3) << "Custom Operator: InferDtype - set output dtype."; for (size_t i = 0; i < op_outputs.size(); ++i) { auto out_name = op_outputs[i]; if (detail::IsDuplicableVar(out_name)) { @@ -763,11 +769,12 @@ void RegisterOperatorWithMetaInfo( auto& grad_op_outputs = OpMetaInfoHelper::GetOutputs(cur_grad_op); auto& grad_op_attrs = OpMetaInfoHelper::GetAttrs(cur_grad_op); auto& grad_kernel_fn = OpMetaInfoHelper::GetKernelFn(cur_grad_op); + auto& grad_infer_shape_fn = OpMetaInfoHelper::GetInferShapeFn(cur_grad_op); - VLOG(1) << "Custom Operator: backward, op name: " << grad_op_name; - VLOG(1) << "Custom Operator: backward, op inputs: " + VLOG(3) << "Custom Operator: backward, op name: " << grad_op_name; + VLOG(3) << "Custom Operator: backward, op inputs: " << string::join_strings(grad_op_inputs, ','); - VLOG(1) << "Custom Operator: backward, op outputs: " + VLOG(3) << "Custom Operator: backward, op outputs: " << string::join_strings(grad_op_outputs, ','); // GradOpDescMaker @@ -809,40 +816,52 @@ void RegisterOperatorWithMetaInfo( }; // Grad InferShape - grad_info.infer_shape_ = [grad_op_inputs, - grad_op_outputs](InferShapeContext* ctx) { - // 1. if forward input exists, gradient's shape is same with forward input - // default - // [Suitable for most situations] - // 2. 
if forward input not exists, and only contains one grad input and - // output, - // use grad input shape as grad output shape - // [Suitable for the situation that forward input is not used as - // backward input] - // TODO(chenweihang): support set grad op infershape func if needed - for (auto& out_name : grad_op_outputs) { - auto fwd_name = detail::NoGrad(out_name); - if (detail::IsDuplicableVar(fwd_name)) { - // Duplicable forward var must as backward input - ctx->ShareDim(fwd_name, out_name); - } else { - if (ctx->HasInput(fwd_name)) { + if (grad_infer_shape_fn == nullptr) { + grad_info.infer_shape_ = [grad_op_inputs, + grad_op_outputs](InferShapeContext* ctx) { + // 1. if forward input exists, gradient's shape is same with forward + // input + // default + // [Suitable for most situations] + // 2. if forward input not exists, and only contains one grad input and + // output, + // use grad input shape as grad output shape + // [Suitable for the situation that forward input is not used as + // backward input] + for (auto& out_name : grad_op_outputs) { + auto fwd_name = detail::NoGrad(out_name); + if (detail::IsDuplicableVar(fwd_name)) { + // Duplicable forward var must as backward input ctx->ShareDim(fwd_name, out_name); } else { - PADDLE_ENFORCE_EQ( - grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, - true, - platform::errors::Unavailable( - "Custom grad operator infershape error. " - "If a custom grad operator contains only one input and " - "only one output, the input shape will be directly set to " - "the output shape. Otherwise, Please set the forward input " - "as the grad operator's input.")); - ctx->ShareDim(grad_op_inputs[0], out_name); + if (ctx->HasInput(fwd_name)) { + ctx->ShareDim(fwd_name, out_name); + } else { + PADDLE_ENFORCE_EQ( + grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, + true, + platform::errors::Unavailable( + "Custom grad operator infershape error. " + "If a custom grad operator contains only one input and " + "only one output, the input shape will be directly set " + "to " + "the output shape. 
Otherwise, Please set the forward " + "input " + "as the grad operator's input or set the InferShapeFn " + "of custom grad operator by " + ".SetInferShapeFn(PD_INFER_SHAPE(...))")); + ctx->ShareDim(grad_op_inputs[0], out_name); + } } } - } - }; + }; + } else { + grad_info.infer_shape_ = [grad_op_inputs, grad_op_outputs, grad_op_attrs, + grad_infer_shape_fn](InferShapeContext* ctx) { + RunInferShapeFunc(ctx, grad_infer_shape_fn, grad_op_inputs, + grad_op_outputs, grad_op_attrs); + }; + } // Kernel func RegisterOperatorKernel(grad_op_name, grad_kernel_fn, grad_op_inputs, @@ -860,11 +879,11 @@ void RegisterOperatorWithMetaInfo( void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map) { auto& meta_info_map = op_meta_info_map.GetMap(); - VLOG(1) << "Custom Operator: size of op meta info map - " + VLOG(3) << "Custom Operator: size of op meta info map - " << meta_info_map.size(); // pair: {op_type, OpMetaInfo} for (auto& pair : meta_info_map) { - VLOG(1) << "Custom Operator: pair first -> op name: " << pair.first; + VLOG(3) << "Custom Operator: pair first -> op name: " << pair.first; RegisterOperatorWithMetaInfo(pair.second); } } @@ -874,7 +893,7 @@ void RegisterOperatorWithMetaInfoMap( // load op api void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - VLOG(1) << "load custom_op lib: " << dso_name; + VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 064dfa0170bdb..9230c36a0c745 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -94,8 +94,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode( // 2. Execute infer shape and choose kernel auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); - static_cast(op.get())->InferShape( - &infer_shape_ctx); + op.get()->Info().infer_shape_(&infer_shape_ctx); auto kernels_iter = all_op_kernels.find(op_type); PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), platform::errors::Unavailable( diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 41c4faa67fbeb..7ced4853c2d8f 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -355,7 +355,7 @@ void build_op_func_list(const platform::Place& place, // TODO(Aurelius84): In case of control flow ops, they are NOT // inheritted // from OperatorWithKernel. 
- op_with_kernel->InferShape(&infer_shape_ctx); + op_with_kernel->Info().infer_shape_(&infer_shape_ctx); } auto kernels_iter = all_op_kernels.find(op->Type()); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2d2e198ef40ec..a0c1bd44da01e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1090,7 +1090,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const { RuntimeInferShapeContext infer_shape_ctx(*this, ctx); - this->InferShape(&infer_shape_ctx); + this->Info().infer_shape_(&infer_shape_ctx); } void OperatorWithKernel::RunImpl(const Scope& scope, @@ -1178,6 +1178,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("infer_shape", platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); + // TODO(chenweihang): replace this after removing `this->IsMKLDNNType()` + // in some mkldnn infershape functions, such conv2d infershape this->InferShape(&infer_shape_ctx); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c5623a8f4f243..29cd24a1e7793 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -491,8 +491,7 @@ static void PreparedOpRunImpl( DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); @@ -537,8 +536,7 @@ static void PreparedOpRunPtImpl( const framework::AttributeMap& default_attrs) { DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, dev_ctx, diff --git a/paddle/pten/api/lib/op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc index 586fa0cc05526..aa2e33afb94b8 100644 --- a/paddle/pten/api/lib/op_meta_info.cc +++ b/paddle/pten/api/lib/op_meta_info.cc @@ -122,13 +122,6 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) { } OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) { - PADDLE_ENFORCE_EQ( - index_, - 0UL, - platform::errors::Unimplemented( - "Currently, the InferShapeFn setting of Grad Op is not supported, " - "And backward Tensor `X@GRAD` will use the shape of forward Tensor " - "`X` by default.")); info_ptr_->SetInferShapeFn(std::forward(func)); return *this; } diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index b2ef90bf87a1a..c5ec3191c1b02 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -105,3 +105,49 @@ PD_BUILD_GRAD_OP(custom_relu) .Inputs({"X", "Out", paddle::Grad("Out")}) .Outputs({paddle::Grad("X")}) .SetKernelFn(PD_KERNEL(ReluBackward)); + +std::vector relu_cpu_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { + relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.mutable_data(out.place()), + 
out.size()); + })); + + return {grad_x}; +} + +std::vector relu_cuda_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out); + +std::vector ReluBackwardWithoutX( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + if (out.place() == paddle::PlaceType::kCPU) { + return relu_cpu_backward_without_x(out, grad_out); + } else if (out.place() == paddle::PlaceType::kGPU) { + return relu_cuda_backward_without_x(out, grad_out); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector> ReluBackwardWithoutXInferShape( + const std::vector& out_shape, + const std::vector& grad_out_shape) { + return {out_shape}; +} + +PD_BUILD_OP(custom_relu_no_x_in_backward) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) + .Inputs({"Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ReluBackwardWithoutX)) + .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index dda42a5c05984..637deeb90569c 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -70,3 +70,22 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, return {grad_x}; } + +std::vector relu_cuda_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + + int numel = out.size(); + int block = 512; + int grid = (numel + block - 1) / block; + PD_DISPATCH_FLOATING_AND_HALF_TYPES( + out.type(), "relu_cuda_backward_kernel", ([&] { + relu_cuda_backward_kernel<<>>( + grad_out.data(), + out.data(), + grad_x.mutable_data(out.place()), + numel); + })); + + return {grad_x}; +} diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 4f075066b9d93..16458841f4488 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -49,7 +49,8 @@ class TestJITLoad(unittest.TestCase): def setUp(self): self.custom_ops = [ - custom_module.custom_relu, custom_module.custom_relu_dup + custom_module.custom_relu, custom_module.custom_relu_dup, + custom_module.custom_relu_no_x_in_backward ] self.dtypes = ['float32', 'float64'] if paddle.is_compiled_with_cuda(): From 5c73a6eaa2be74eaed7e974b433a4c44f6da58b6 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 10 Jan 2022 11:13:59 +0800 Subject: [PATCH 049/151] [Unify Tensors PR #5] framework::Tensor inherits from DenseTensor,test=allcases (#38632) * Added shared_ptr member & corresponding interfaces to Storage * Removed original pten::Allocation from Storage and adjusted the interfaces accordingly * Fixed issues with storage offset * Used place to malloc allocation for TensorStorage * [Unify Tensors PR #3]Ported framework::Tensor interfaces to pten::DenseTensor * Fixed issues with place * Added comments * Moved mutable_data with stream argument to DenseTensor * Added set_offset interface * Fixed CI issues,test=allcases * [Unify Tensors PR #4] Port LoDTensor interfaces to DenseTensor * Removed friend class EigenTensor/EigenMatrix/EigenVector from Tensor * Modified framework::Tensor to inherit from DenseTensor * Reverted changes too pten_layout() interface * Removed friend classes * Rearranged 
cfunction calls from tensor.data() to tensor.data() * Fixed CI issues * Fixed lite issues * Fixed data() interface issues,test=allcases * Resolved IsInitialized() issues * Fixed ResetHolder() issues * Fixed MKLDNN & Storage issues * Resolved ShareBufferWith() issues * Fixed LoD issues --- .../fluid/distributed/service/brpc_utils.cc | 24 +- .../fluid/distributed/service/heter_client.cc | 11 +- .../fluid/distributed/service/heter_server.h | 2 +- paddle/fluid/framework/CMakeLists.txt | 2 +- .../fluid/framework/copy_same_tensor_test.cc | 4 +- paddle/fluid/framework/data_transform.cc | 1 - .../framework/details/all_reduce_op_handle.cc | 8 +- .../framework/details/broadcast_op_handle.cc | 4 +- .../details/fused_all_reduce_op_handle.cc | 8 +- .../framework/details/reduce_op_handle.cc | 6 +- .../details/sparse_all_reduce_op_handle.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 2 +- paddle/fluid/framework/fleet/ascend_wrapper.h | 3 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 9 +- paddle/fluid/framework/heterxpu_trainer.cc | 12 +- paddle/fluid/framework/lod_tensor.h | 4 +- .../new_executor/standalone_executor_test.cc | 1 - paddle/fluid/framework/parallel_executor.cc | 4 +- paddle/fluid/framework/program_desc.cc | 5 + paddle/fluid/framework/save_load_util.cc | 4 +- paddle/fluid/framework/tensor.cc | 186 +++----------- paddle/fluid/framework/tensor.h | 236 +----------------- paddle/fluid/framework/tensor_impl.h | 55 ---- paddle/fluid/framework/tensor_test.cc | 3 - paddle/fluid/framework/tensor_util.cc | 2 +- paddle/fluid/framework/variable.h | 6 +- paddle/fluid/imperative/all_reduce.cc | 4 +- paddle/fluid/imperative/bkcl_context.cc | 4 +- paddle/fluid/imperative/hccl_context.cc | 4 +- paddle/fluid/imperative/nccl_context.cc | 2 +- .../tests/test_gradient_accmulator.cc | 4 +- paddle/fluid/inference/api/api_impl_tester.cc | 6 +- paddle/fluid/inference/lite/tensor_utils.cc | 4 +- .../amp/update_loss_scaling_op_npu.cc | 2 +- paddle/fluid/operators/coalesce_tensor_op.cc | 8 +- .../fluid/operators/collective/allreduce_op.h | 2 +- .../operators/collective/barrier_op.cu.cc | 2 +- .../operators/collective/broadcast_op.cu.cc | 2 +- .../operators/collective/broadcast_op_xpu.cc | 2 +- .../fluid/operators/collective/c_reduce_op.h | 4 +- paddle/fluid/operators/detection/bbox_util.h | 4 +- paddle/fluid/operators/layer_norm_op.cu | 4 +- .../fluid/operators/math/matrix_inverse.cu.cc | 2 +- paddle/fluid/operators/optimizers/lamb_op.h | 4 +- .../fluid/operators/reader/buffered_reader.cc | 6 +- paddle/fluid/operators/spectral_op.cc | 12 +- paddle/fluid/operators/spectral_op.cu | 18 +- .../operators/tensorrt/tensorrt_engine_op.h | 2 +- .../fluid/platform/device/ipu/ipu_compiler.cc | 2 +- paddle/fluid/platform/device/ipu/ipu_utils.cc | 2 +- paddle/fluid/platform/device/ipu/ipu_utils.h | 2 +- .../platform/device/npu/npu_op_runner.cc | 2 +- .../fluid/platform/device/npu/npu_op_runner.h | 4 +- paddle/fluid/pybind/imperative.cc | 6 +- paddle/fluid/pybind/tensor_py.h | 2 +- paddle/pten/api/lib/utils/storage.h | 17 +- paddle/pten/core/dense_tensor.cc | 195 +++++---------- paddle/pten/core/dense_tensor.h | 31 +-- paddle/pten/core/storage.h | 2 +- paddle/pten/core/utils/intrusive_ptr.h | 4 +- paddle/pten/tests/core/test_dense_tensor.cc | 5 +- 61 files changed, 251 insertions(+), 731 deletions(-) diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 92dcde99cccb0..6eb8462977b60 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ 
b/paddle/fluid/distributed/service/brpc_utils.cc @@ -103,19 +103,17 @@ void SerializeLodTensor(framework::Variable* var, if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); - iobuf->append(reinterpret_cast(tensor->data()), - data_len); + iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = new char[tensor->numel() * framework::SizeOfType(tensor->type())]; auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); @@ -147,19 +145,17 @@ void SerializeSelectedRows(framework::Variable* var, if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); - iobuf->append(reinterpret_cast(tensor->data()), - data_len); + iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = new char[tensor->numel() * framework::SizeOfType(tensor->type())]; auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/service/heter_client.cc index d9ec6b21fd377..13016d60515dd 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/service/heter_client.cc @@ -34,7 +34,7 @@ int GetMicroId(const platform::DeviceContext& ctx, auto micro_id = -1; auto* tensor = var->GetMutable(); if (platform::is_cpu_place(tensor->place())) { - auto data = reinterpret_cast(tensor->data()); + auto data = reinterpret_cast(tensor->data()); micro_id = static_cast(data[0]); } else { #ifdef PADDLE_WITH_CUDA @@ -43,11 +43,10 @@ int GetMicroId(const platform::DeviceContext& ctx, char* temp_ptr = temp.data(); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); float* temp_ptr_float = reinterpret_cast(temp_ptr); micro_id = static_cast(temp_ptr_float[0]); #endif diff --git a/paddle/fluid/distributed/service/heter_server.h 
b/paddle/fluid/distributed/service/heter_server.h index 5f062755c9242..201074810cf31 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/service/heter_server.h @@ -240,7 +240,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { platform::errors::InvalidArgument( "Not find variable microbatch_id in scope.")); auto* tensor = var->GetMutable(); - auto data = reinterpret_cast(tensor->data()); + auto data = reinterpret_cast(tensor->data()); auto micro_id = static_cast(data[0]); int minibatch_index = micro_id / 10; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bd096f41ccc49..902943d14ff9d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -91,7 +91,7 @@ endif() cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context) +cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context place memory) if(WITH_GPU) nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor) diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 0b1fdc3944689..14bef7fe023f6 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -77,8 +77,8 @@ static bool CopySameTensorTestMain(const DDim &dims, TensorCopySync(src_tensor, platform::CPUPlace(), &dst_cpu_tensor); } - const void *ground_truth_ptr = src_cpu_tensor.data(); - const void *result_ptr = dst_cpu_tensor.data(); + const void *ground_truth_ptr = src_cpu_tensor.data(); + const void *result_ptr = dst_cpu_tensor.data(); size_t byte_num = product(dims) * sizeof(T); return std::memcmp(ground_truth_ptr, result_ptr, byte_num) == 0; } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 16c1923ce1815..d8c372becf1b4 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -45,7 +45,6 @@ void TransformData(const OpKernelType &expected_kernel_type, Tensor out; const DataLayout lin = kernel_type_for_var.data_layout_; const DataLayout lout = expected_kernel_type.data_layout_; - // do layout transform if (NeedTransformLayout(lout, lin)) { #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b1573093ec333..f93202769dbd0 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -153,7 +153,7 @@ void AllReduceOpHandle::AllReduceImpl( "The place type of tensors of the same variable " "in different local scopes should be equal.")); - lod_tensor_data.emplace_back(lod_tensor.data()); + lod_tensor_data.emplace_back(lod_tensor.data()); places.emplace_back(lod_tensor.place()); VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() @@ -225,7 +225,7 @@ void AllReduceOpHandle::AllReduceFunc( ->GetMutable(); // Reduce All Tensor to trg in CPU - ReduceBufferData func(lod_tensor_data, trg.data(), numel); + ReduceBufferData func(lod_tensor_data, trg.data(), numel); VisitDataType(trg.type(), func); for (size_t i = 1; i < local_exec_scopes_.size(); ++i) { @@ -235,9 +235,9 @@ void AllReduceOpHandle::AllReduceFunc( size_t size = numel * 
SizeOfType(trg.type()); RunAndRecordEvent(p, [&trg, var, p, size] { - auto dst_ptr = var->GetMutable()->data(); + auto dst_ptr = var->GetMutable()->data(); platform::CPUPlace cpu_place; - memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); + memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); }); } } diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index a11a244214d4f..01dc5a45146f1 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -101,7 +101,7 @@ void BroadcastOpHandle::BroadcastOneVar( void *send_recv_buffer = nullptr; if (root_id == dst_id) { - send_recv_buffer = const_cast(in_tensor.data()); + send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) @@ -162,7 +162,7 @@ void BroadcastOpHandle::BroadcastOneVar( void *send_recv_buffer = nullptr; if (root_id == dst_id) { - send_recv_buffer = const_cast(in_tensor.data()); + send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index bd153f24fa318..b65d4e4fcd55a 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -220,17 +220,17 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( g_tensor.begin(), g_tensor.end(), [](const std::pair &grad1, const std::pair &grad2) -> bool { - return grad1.second->data() < grad2.second->data(); + return grad1.second->data() < grad2.second->data(); }); size_t size_of_dtype = framework::SizeOfType(dtype); for (size_t k = 1; k < g_tensor.size(); ++k) { - const void *cur_address = g_tensor.at(k - 1).second->data(); + const void *cur_address = g_tensor.at(k - 1).second->data(); int64_t len = g_tensor.at(k - 1).second->numel(); auto offset = platform::Alignment(len * size_of_dtype, places_[0]); void *infer_next_address = reinterpret_cast( reinterpret_cast(cur_address) + offset); - const void *next_address = g_tensor.at(k).second->data(); + const void *next_address = g_tensor.at(k).second->data(); VLOG(10) << string::Sprintf( "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer " @@ -267,7 +267,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( std::vector lod_tensor_data; lod_tensor_data.reserve(place_num); for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { - auto data = grads_tensor.at(scope_idx).at(0).second->data(); + auto data = grads_tensor.at(scope_idx).at(0).second->data(); lod_tensor_data.emplace_back(data); } std::vector grad_var_names; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index bbc458804a195..196f7a3d4a4bf 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -159,7 +159,7 @@ void ReduceOpHandle::RunImpl() { VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); - if (reduce_sum_trg.data() != trg->data()) { + if (reduce_sum_trg.data() != trg->data()) { TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); } } @@ -181,7 +181,7 @@ void ReduceOpHandle::RunImpl() { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; 
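// An illustrative sketch (assumed, simplified API) of the call-site pattern
// this series converts throughout the codebase once framework::Tensor
// inherits from pten::DenseTensor: the untyped accessor no longer needs an
// explicit template argument, while typed access is unchanged:
//
//   const void* raw = lod_tensor.data();            // previously data<void>()
//   const float* typed = lod_tensor.data<float>();  // typed overload unchanged
//   void* buffer = const_cast<void*>(raw);          // e.g. for NCCL send buffers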
auto &nccl_ctx = nccl_ctxs_->at(dev_id); - void *buffer = const_cast(lod_tensor.data()); + void *buffer = const_cast(lod_tensor.data()); void *recvbuffer = nullptr; if (root_id == dev_id) { recvbuffer = @@ -227,7 +227,7 @@ void ReduceOpHandle::RunImpl() { int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; auto &bkcl_ctx = bkcl_ctxs_->at(dev_id); - void *buffer = const_cast(lod_tensor.data()); + void *buffer = const_cast(lod_tensor.data()); void *recvbuffer = nullptr; if (root_id == dev_id) { recvbuffer = diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d916b9bc26276..ed485ed587c0b 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -146,7 +146,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &place = places_[i]; auto &in = *ins[i]; - void *in_tensor_buf = const_cast(in.data()); + void *in_tensor_buf = const_cast(in.data()); auto &out = *outs[i]; float *out_tensor_buf = out.data(); @@ -175,7 +175,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { // dgc use ncclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; - void *gather_buff = gathers[i]->data(); + void *gather_buff = gathers[i]->data(); VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index cef1016aa5340..95913664961b3 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -134,7 +134,7 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init data, data buffer - t_.data = const_cast(tensor.data()); + t_.data = const_cast(tensor.data()); // init device, DLDevice type with device_type and device_id auto place = tensor.place(); diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index 82ce3b28776f1..4127adf1bfe27 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -150,8 +150,7 @@ class AscendInstance { VarTypeToGeType(tensor->type())); tensor_desc.SetRealDimCnt(vec_dim.size()); - const uint8_t *data = - reinterpret_cast(tensor->data()); + const uint8_t *data = reinterpret_cast(tensor->data()); std::vector dst(numel * GeTypeSize(tensor->type())); memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel); ge::Tensor ge_tensor(tensor_desc, dst); diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index a67f9a5e2c733..66f0d116f2412 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -112,20 +112,19 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, char* data_ptr = const_cast(req_data->data()); if (platform::is_cpu_place(tensor->place())) { - memcpy(data_ptr, tensor->data(), + memcpy(data_ptr, tensor->data(), tensor->numel() * SizeOfType(tensor->type())); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * 
SizeOfType(tensor->type()), nullptr); + tensor->data(), tensor->numel() * SizeOfType(tensor->type()), + nullptr); #endif #ifdef PADDLE_WITH_XPU memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::XPUPlace, tensor->place()), - tensor->data(), - tensor->numel() * SizeOfType(tensor->type())); + tensor->data(), tensor->numel() * SizeOfType(tensor->type())); #endif } } diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index ebd737c2d5794..3ed886e874db0 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -339,7 +339,7 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); - cudaMemset(thread_tensor->data(), 0, + cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif #ifdef PADDLE_WITH_XPU @@ -351,11 +351,11 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, platform::DeviceContext* dev_ctx = pool.Get(place); const platform::XPUDeviceContext* xpu_ctx = reinterpret_cast(dev_ctx); - xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, + xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif } else { - memset(thread_tensor->data(), 0, + memset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); } } @@ -367,7 +367,7 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); - cudaMemset(root_tensor->data(), 0, + cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif #ifdef PADDLE_WITH_XPU @@ -379,11 +379,11 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, platform::DeviceContext* dev_ctx = pool.Get(place); const platform::XPUDeviceContext* xpu_ctx = reinterpret_cast(dev_ctx); - xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, + xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif } else { - memset(root_tensor->data(), 0, + memset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); } } diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 7dee0f44e384d..dff6d0e01839a 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -144,8 +144,8 @@ class LoDTensor : public Tensor { */ size_t NumLevels() const { return lod_.size(); } /* - * Number of elements in a level. - */ + * Number of elements in a level. 
+ */ size_t NumElements(size_t level = 0) const { PADDLE_ENFORCE_LT( level, NumLevels(), diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 6876f219c92b9..b42f2da2a4d78 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -71,7 +71,6 @@ ProgramDesc load_from_file(const std::string& file_name) { fin.seekg(0, std::ios::beg); fin.read(&buffer[0], buffer.size()); fin.close(); - ProgramDesc program_desc(buffer); return program_desc; } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ca86cda52fb61..9a38a2d5d6fe8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -788,7 +788,7 @@ void ParallelExecutor::BCastParamsToDevices( void *buffer; if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); + buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -831,7 +831,7 @@ void ParallelExecutor::BCastParamsToDevices( void *buffer; if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); + buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 4a31adcca65ec..60b93f4a71664 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -101,20 +101,25 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE_EQ(desc_.ParseFromString(binary_str), true, platform::errors::InvalidArgument( "Failed to parse program_desc from binary string.")); + VLOG(1) << 3333; InitFromProto(); } void ProgramDesc::InitFromProto() { + VLOG(1) << 4444; for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } + VLOG(1) << 5555; for (auto &block : blocks_) { for (auto *op : block->AllOps()) { for (const auto &attr : op->Proto()->attrs()) { if (attr.type() == proto::AttrType::BLOCK) { + VLOG(1) << 6666; size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } else if (attr.type() == proto::AttrType::BLOCKS) { + VLOG(1) << 7777; auto blks_idx = attr.blocks_idx(); std::vector block_descs; for (int blk_idx : blks_idx) { diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index 1731a974b71d8..0f1a8e2a9ed5f 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -295,12 +295,12 @@ bool SaveTensorToDisk(const std::string& file_name, // save tensor uint64_t data_size = tensor->numel() * framework::SizeOfType(tensor->type()); - auto* data_ptr = tensor->data(); + auto* data_ptr = tensor->data(); if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); - data_ptr = temp.data(); + data_ptr = temp.data(); #else PADDLE_THROW(platform::errors::Unavailable( "Tensor is in CUDA device, but paddle not compiled with CUDA.")); diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 2ccd295577039..e5dfe28be7a3c 100644 --- 
a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +#include "paddle/pten/api/lib/utils/storage.h" DECLARE_bool(use_stream_safe_cuda_allocator); @@ -26,148 +27,55 @@ class Allocation; namespace paddle { namespace framework { -extern size_t SizeOfType(proto::VarType::Type type); -void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( - "Tensor holds no memory. " - "Call Tensor::mutable_data firstly.")); - size_t size = numel() * SizeOfType(type()); - - PADDLE_ENFORCE_LE( - size, memory_size(), - platform::errors::PreconditionNotMet( - "Tensor's dimension is out of bound." - "Tensor's dimension must be equal or less than the size of its " - "memory." - "But received Tensor's dimension is d%, memory's size is %d.", - size, memory_size())); -} - -Tensor::Tensor(const proto::VarType::Type& dtype) - : type_(dtype), - offset_(0), - inplace_version_counter_(std::make_shared(0)) {} - -size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; -} - -void* Tensor::mutable_data(const platform::Place& place, - proto::VarType::Type type, size_t requested_size) { - type_ = type; - PADDLE_ENFORCE_GE( - numel(), 0, - platform::errors::PreconditionNotMet( - "The Tensor's element number must be equal or greater than zero. " - "The Tensor's shape is [", - dims(), "] now")); - size_t size = numel() * SizeOfType(type); - if (requested_size && (requested_size > size)) { - size = requested_size; - } - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - // Reset holder first before re-allocate to save memory - holder_.reset(); - holder_ = memory::AllocShared(place, size); - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -void* Tensor::mutable_data(const platform::Place& place, - size_t requested_size) { - PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet( - "The tensor is not initialized.")); - return mutable_data(place, type_, requested_size); -} - -void* Tensor::mutable_data(const platform::Place& place, - proto::VarType::Type type, - const platform::Stream& stream) { - type_ = type; - PADDLE_ENFORCE_GE( - numel(), 0, - platform::errors::PreconditionNotMet( - "The Tensor's element number must be equal or greater than zero. 
" - "The Tensor's shape is [", - dims(), "] now")); - size_t size = numel() * SizeOfType(type); - - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_ || - !(platform::is_gpu_place(place) && - memory::InSameStream(holder_, stream))) { - holder_.reset(); - holder_ = memory::AllocShared(place, size, stream); - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} -Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) { - PADDLE_ENFORCE_NOT_NULL( - inplace_version_counter_, - platform::errors::PreconditionNotMet( - "Tensor does not hold inplace_version_counter_.")); - - inplace_version_counter_ = src.inplace_version_counter_; - return *this; -} Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE( - begin_idx, 0, - platform::errors::OutOfRange("The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE( - end_idx, dims_[0], - platform::errors::OutOfRange("The end row index is out of bound.")); + PADDLE_ENFORCE_GE(begin_idx, 0, + paddle::platform::errors::OutOfRange( + "The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE(end_idx, meta_.dims[0], + paddle::platform::errors::OutOfRange( + "The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - platform::errors::InvalidArgument( + paddle::platform::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, end_idx)); - if (dims_[0] == 1) { + if (meta_.dims[0] == 1) { return *this; } else { - size_t base = numel() / dims_[0]; + size_t base = numel() / meta_.dims[0]; Tensor dst; - dst.holder_ = holder_; - dst.set_layout(layout_); - dst.type_ = type_; - DDim dst_dims = dims_; + dst.storage_ = pten::make_intrusive( + storage_->data_shared()); + dst.meta_.layout = meta_.layout; + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); return dst; } } std::vector Tensor::Split(int64_t split_size, int64_t axis) const { check_memory_size(); - PADDLE_ENFORCE_GE(dims_.size(), 0, - platform::errors::OutOfRange( + + PADDLE_ENFORCE_GE(meta_.dims.size(), 0, + paddle::platform::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( split_size, 0, - platform::errors::OutOfRange( + paddle::platform::errors::OutOfRange( "split expects split_size be non-negative, but got split_size is %d", split_size)); - int64_t numel_size = dims_[axis]; + + int64_t numel_size = meta_.dims[axis]; int64_t num_splits = 1; if (split_size != 0) { @@ -187,49 +95,33 @@ std::vector Tensor::Split(int64_t split_size, int64_t axis) const { std::vector Tensor::Chunk(int64_t chunks, int64_t axis) const { check_memory_size(); - PADDLE_ENFORCE_GE(dims_.size(), 0, - platform::errors::OutOfRange( + PADDLE_ENFORCE_GE(meta_.dims.size(), 0, + paddle::platform::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( chunks, 0, - platform::errors::OutOfRange( + 
paddle::platform::errors::OutOfRange( "chunks expects to be greater than 0, but got chunks is %d", chunks)); - int64_t numel_size = dims_[axis]; + int64_t numel_size = meta_.dims[axis]; int64_t split_size = (numel_size + chunks - 1) / chunks; return Split(split_size, axis); } -Tensor& Tensor::Resize(const DDim& dims) { - dims_ = dims; +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; return *this; } +Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); -const DDim& Tensor::dims() const { return dims_; } - -int64_t Tensor::numel() const { return product(dims_); } - -void Tensor::ResetHolder(std::shared_ptr holder) { - PADDLE_ENFORCE_EQ( - offset_, 0, - platform::errors::Fatal( - "Only the offset is supported to zero when the holder is reset.")); - if (holder_) { - PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()) + offset_, holder->size(), - paddle::platform::errors::InvalidArgument( - "The size of Holder is not enough to store the Tensor.")); - } - holder_ = holder; -} - -void Tensor::ResetHolderWithType(std::shared_ptr holder, - const proto::VarType::Type& type) { - type_ = type; - ResetHolder(holder); + inplace_version_counter_ = src.inplace_version_counter_; + return *this; } -void Tensor::set_type(const proto::VarType::Type& type) { type_ = type; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 7eebd97d06523..e86009e9aafea 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -30,6 +30,8 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/stream.h" +#include "paddle/pten/core/dense_tensor.h" + namespace paddle { namespace memory { namespace allocation { @@ -75,98 +77,10 @@ class LoDTensor; Variable object but not a pointer. */ -class TensorInplaceVersion { +class Tensor : public pten::DenseTensor { public: - explicit TensorInplaceVersion(uint32_t inplace_version = 0) - : inplace_version_(inplace_version) {} - bool IsUnique() const { return inplace_version_ == 0; } - void Bump() { ++inplace_version_; } - uint32_t CurrentVersion() const { return inplace_version_; } - void SetInplaceVersionToZero() { inplace_version_ = 0; } - - private: - uint32_t inplace_version_; -}; - -class Tensor { -#ifdef PADDLE_WITH_MKLDNN - - public: - inline dnnl::memory::format_tag format() const { return format_; } - - inline void set_format(const dnnl::memory::format_tag format) { - format_ = format; - } - - protected: - /** - * @brief the detail format of memory block which have layout as kMKLDNN - * - * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. - */ - - dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; -#endif - - public: - Tensor() - : type_(proto::VarType::FP32), - offset_(0), - inplace_version_counter_(std::make_shared(0)) {} - - explicit Tensor(const proto::VarType::Type&); - - /*! Return a pointer to mutable memory block. */ - const void* data() const; - - template - T* data(); - - /*! Return a pointer to constant memory block. 
*/ - template - const T* data() const; - - inline bool IsInitialized() const; - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - T* mutable_data(const platform::Place& place, size_t requested_size = 0); - - void* mutable_data(const platform::Place& place, proto::VarType::Type type, - size_t requested_size = 0); - - void* mutable_data(const platform::Place& place, size_t requested_size = 0); - - void* mutable_data(const platform::Place& place, proto::VarType::Type type, - const platform::Stream& stream); - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * @param[in] requested_size The size of the block in bytes. - * - * @note If not exist, then allocation. - */ - template - T* mutable_data(const DDim& dims, const platform::Place& place, - size_t requested_size = 0); - - /*! Return the dimensions of the memory block. */ - const DDim& dims() const; - - /*! Return the numel of the memory block. */ - int64_t numel() const; - - /*! Resize the dimensions of the memory block. */ - Tensor& Resize(const DDim& dims); + using DenseTensor = pten::DenseTensor; + using DenseTensor::DenseTensor; /*! The internal of two tensors share the same memory block. */ Tensor& ShareDataWith(const Tensor& src); @@ -174,150 +88,16 @@ class Tensor { /*! The internal of two tensors share the same inplace version counter. */ Tensor& ShareInplaceVersionCounterWith(const Tensor& src); - /** - * @brief Return a sub-tensor of the given tensor. - * - * @param[in] begin_idx The index of the start row(inclusive) to slice. - * The index number begins from 0. - * @param[in] end_idx The index of the end row(exclusive) to slice. - * The index number begins from 0. - */ Tensor Slice(int64_t begin_idx, int64_t end_idx) const; - /** - * @brief Return a tensor list of the given tensor. - * - * @param[in] split_size The size of tensor to be split along axis. - * @param[in] axis The axis along which to split. - */ std::vector Split(int64_t split_size, int64_t axis) const; - /** - * @brief Return a tensor list of the given tensor. - * - * @param[in] chunks The number of tensor to be split along axis. - * @param[in] axis The axis along which to split. - */ std::vector Chunk(int64_t chunks, int64_t axis) const; - const platform::Place& place() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, - platform::errors::PreconditionNotMet( - "Tensor not initialized yet when Tensor::place() is called.")); - return holder_->place(); + Tensor& Resize(const DDim& dims) { + meta_.dims = dims; + return *this; } - - proto::VarType::Type type() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, - platform::errors::PreconditionNotMet( - "Tensor not initialized yet when Tensor::type() is called.")); - return type_; - } - - /** - * [Add method get the saved type of tensor] - * - * After the introduction of complex number calculations, Ops that support - * complex number calculations generally support type promotion, such as - * x(float32) + y(complex64) = out(complex64), then the type of the grad - * tensor should be dout(complex64), dx(float32), dy (complex64), but the - * type of dx to be recognized to be float32 by the grad Op relay on the type - * of forward tensor x. But many of our ops have registered InplaceInferer, - * covering the tensor memory of x with out, so as to save storage. 
- * - * In this case, the dim and type information recorded by x still exist, - * but because x becomes an uninitialized tensor, The type of x record cannot - * be obtained with x.type(), but the type is still valid here, so we - * add saved_type(), This method SHOULD NOT be called by general scenarios. - */ - proto::VarType::Type saved_type() const { return type_; } - - // memory size returns the holding memory size in byte. - size_t memory_size() const; - - void check_memory_size() const; - - DataLayout layout() const { return layout_; } - - void set_layout(const DataLayout layout) { layout_ = layout; } - - void clear() { - holder_ = nullptr; - offset_ = 0; - } - - void ShareBufferWith(const Tensor& tensor) { - holder_ = tensor.holder_; - offset_ = tensor.offset_; - // NOTE(chenfeiyu): when sharing buffer, by definition only holder - // to the memory allocation and offset should be shared. Shape, - // data type, layout, and other metadata associated with a Tensor - // should not be copied. - } - - void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } - - bool IsSharedBufferWith(const Tensor& src) const { - return holder_ && holder_ == src.Holder(); - } - - const std::shared_ptr& Holder() const { return holder_; } - size_t offset() const { return offset_; } - void set_offset(size_t offset) { offset_ = offset; } - - std::shared_ptr MoveMemoryHolder() { - return std::move(holder_); - } - - void ResetHolder(std::shared_ptr holder); - - void ResetHolderWithType(std::shared_ptr holder, - const proto::VarType::Type& type); - - void set_type(const proto::VarType::Type& type); - - TensorInplaceVersion& InplaceVersionCounter() { - return *inplace_version_counter_; - } - - private: - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - proto::VarType::Type type_; - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /** - * @brief the layout of memory block, default is NHWC. - * - * @note the memory allocation order, describe how weight/data is stored - * For example, in 4-D Tensor(rank=4), there are three commonly - * used layout. They are - * NCHW, NHWC, CHWN. - * N,C,H,W for respectively the batch size, the number of - * feature maps, the height. - */ - // Fix me: here just change the default layout to kNCHW - // it doesn't fix the real issue, i.e. feeder should set up tensor layout - // according to actual input data - DataLayout layout_ = DataLayout::kNCHW; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really begins. - */ - size_t offset_; - std::shared_ptr inplace_version_counter_; }; } // namespace framework diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index a83b5d0662bb9..98ad9a629b5ab 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -20,61 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -template -inline const T* Tensor::data() const { - check_memory_size(); - bool valid = - std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE_EQ( - valid, true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type, it holds %s, but desires to be %s.", - DataTypeToString(type_), - DataTypeToString(DataTypeTrait::DataType()))); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); -} - -inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } - -template -inline T* Tensor::data() { - check_memory_size(); - bool valid = - std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE_EQ( - valid, true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type, it holds %s, but desires to be %s", - DataTypeToString(type_), - DataTypeToString(DataTypeTrait::DataType()))); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -inline const void* Tensor::data() const { - check_memory_size(); - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -template -inline T* Tensor::mutable_data(const DDim& dims, const platform::Place& place, - size_t requested_size) { - static_assert(std::is_pod::value, "T must be POD"); - Resize(dims); - return mutable_data(place, requested_size); -} - -template -inline T* Tensor::mutable_data(const platform::Place& place, - size_t requested_size) { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast( - mutable_data(place, DataTypeTrait::DataType(), requested_size)); -} inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 71ff50c92ca59..a58f4a6b5f4c1 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -45,7 +45,6 @@ TEST(Tensor, DataAssert) { } catch (platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " "Tensor::mutable_data firstly.") != std::string::npos); @@ -189,8 +188,6 @@ TEST(Tensor, ShareDataWith) { } catch (paddle::platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("holder_ should not be null") != - std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. 
Call " "Tensor::mutable_data firstly.") != std::string::npos); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 6394e84c81a2b..f2323f6e2c6ee 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -45,7 +45,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); - dst->Resize(src.dims()); dst->set_layout(src.layout()); auto src_place = src.place(); @@ -442,6 +441,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); + VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data from " << src_place << " to " diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index f8ad990a668ce..2fa48150903ad 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -72,7 +72,7 @@ class Variable { private: // This method hides type T, so it doesn't appear as a template parameter of // Variable. - framework::TensorInplaceVersion* InplaceVersionCounter(); + pten::TensorInplaceVersion* InplaceVersionCounter(); public: void SetInplaceVersionToZero(); @@ -114,8 +114,8 @@ class Variable { std::shared_ptr holder_; }; -inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() { - framework::TensorInplaceVersion* version_counter_ptr(nullptr); +inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { + pten::TensorInplaceVersion* version_counter_ptr(nullptr); if (IsType()) { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 31da214fbc39a..78855cc5c9e2e 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -60,7 +60,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Imperative mode does not support multi-CPU training yet.")); - const void *src_ptr = src.data(); + const void *src_ptr = src.data(); dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto nccl_dtype = platform::ToNCCLDataType(src.type()); @@ -129,7 +129,7 @@ static void AllReduce(const framework::SelectedRows &src, auto feature_size = framework::product(dims) / dims[0]; dst_tensor->Resize(dims); auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); - const auto *src_tensor_ptr = src_tensor.data(); + const auto *src_tensor_ptr = src_tensor.data(); auto sizeof_dtype = framework::SizeOfType(dtype); int64_t row_offset = 0; diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 6569929d6f5d7..2072c41673aaf 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -39,7 +39,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Dynamic graph mode does not support multi-CPU training yet.")); - const void *src_ptr = src.data(); + const void *src_ptr = src.data(); dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto bkcl_dtype = platform::ToBKCLDataType(src.type()); @@ -158,7 +158,7 @@ void 
BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { platform::BKCLCommContext::Instance().Get(ring_id, place); XPUStream stream = comm->stream(); - void *src_ptr = src_tensor->data(); + void *src_ptr = src_tensor->data(); auto data_type = platform::ToBKCLDataType(src_tensor->type()); PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr, diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 55c52ae6c11de..818b2f424b6af 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -42,7 +42,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Imperative mode does not support multi-CPU training yet.")); - void *src_ptr = const_cast(src.data()); + void *src_ptr = const_cast(src.data()); dst->Resize(src.dims()); void *dst_ptr = dst->mutable_data(src.place(), src.type()); HcclDataType hccl_dtype = platform::ToHCCLDataType(src.type()); @@ -168,7 +168,7 @@ void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { aclrtStream stream = comm->stream(); void *src_ptr = - reinterpret_cast(const_cast(src_tensor->data())); + reinterpret_cast(const_cast(src_tensor->data())); auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type()); PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(), diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 15146f6c1204e..1b50c515635d2 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -143,7 +143,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { platform::NCCLCommContext::Instance().Get(ring_id, place); gpuStream_t stream = comm->stream(); - void *src_ptr = src_tensor->data(); + void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index cb4ab2e79cb99..0a7df9953ad45 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -176,8 +176,8 @@ static bool IsEqualVar(const framework::Variable& var1, return false; } - auto* t1_p = t1.data(); - auto* t2_p = t2.data(); + auto* t1_p = t1.data(); + auto* t2_p = t2.data(); return std::memcmp(t1_p, t2_p, t1.numel() * framework::SizeOfType(t1.type())) == 0; } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 89aec34110b85..124279d246093 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -37,13 +37,13 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; if (t->type() == framework::proto::VarType::INT64) { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; } else if (t->type() == framework::proto::VarType::FP32) { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else if (t->type() == framework::proto::VarType::INT32) { - 
pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); pt.dtype = PaddleDType::INT32; } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index cbc947ea6436a..b1e0eb5ef16ab 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -210,7 +210,7 @@ void TensorCopyAsync(paddle::lite_api::Tensor* dst, const size_t bytes = static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); - const void* src_data = src.data(); + const void* src_data = src.data(); void* dst_data{nullptr}; dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), GetLiteTargetType(src.place())); @@ -242,7 +242,7 @@ void TensorCopyAsync(framework::LoDTensor* dst, template <> void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->ShareExternalMemory(src->data(), src->memory_size(), + dst->ShareExternalMemory(src->data(), src->memory_size(), GetLiteTargetType(src->place())); dst->SetPrecision(GetLitePrecisionType(src->type())); paddle::lite::LoD lite_lod; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 021f3a13ce7cf..8160368d72ad1 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -176,7 +176,7 @@ class LazyZerosNPU { NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); runner_zeros.Run(stream); zero_tensor->check_memory_size(); - zero_ptr = zero_tensor->data(); + zero_ptr = zero_tensor->data(); } for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 752e5dc4a8772..5655fd25ec24b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -260,8 +260,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { size_of_dtype : len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" - << " address: " << out_tensors[i]->data() << " len: " << len - << ", "; + << " address: " << out_tensors[i]->data() << " len: " << len << ", "; offset += len; } PADDLE_ENFORCE_EQ( @@ -300,9 +299,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel { place, align_size) / size_of_dtype : static_cast(size); - const void *ptr = lod_tensors[i]->IsInitialized() - ? lod_tensors[i]->data() - : nullptr; + const void *ptr = + lod_tensors[i]->IsInitialized() ? 
lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 4e6d86d49e863..226b2c5132318 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -43,7 +43,7 @@ class AllReduceOpKernel : public framework::OpKernel { int dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - auto* sendbuff = in->data(); + auto* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = out->mutable_data(place); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index c9aef237699f3..a98a0bf6ab4a9 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -33,7 +33,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); void* recvbuff = out->mutable_data(place); int rid = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index daaaf8b7a2e41..229d42e64e4e5 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -46,7 +46,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { "because this op can only be an In-Place operation.")); void* send_recv_buffer = out->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_EQ( - send_recv_buffer, in->data(), + send_recv_buffer, in->data(), platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); diff --git a/paddle/fluid/operators/collective/broadcast_op_xpu.cc b/paddle/fluid/operators/collective/broadcast_op_xpu.cc index 9cd5c5fd22cc3..e8566803aecfa 100644 --- a/paddle/fluid/operators/collective/broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/broadcast_op_xpu.cc @@ -52,7 +52,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel { "because this op can only be an In-Place operation.")); void* send_recv_buffer = out->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_EQ( - send_recv_buffer, in->data(), + send_recv_buffer, in->data(), platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index b950339bd22be..c06b2683a6bbe 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -213,7 +213,7 @@ class CReduceOpXPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); BKCLDataType dtype = platform::ToBKCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = out->mutable_data(place); @@ -276,7 +276,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = 
out->mutable_data(place); diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b262f05d6b187..18c45a1a4c6c1 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -144,8 +144,8 @@ void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { static void AppendProposals(framework::Tensor* dst, int64_t offset, const framework::Tensor& src) { - auto* out_data = dst->data(); - auto* to_add_data = src.data(); + auto* out_data = dst->data(); + auto* to_add_data = src.data(); size_t size_of_t = framework::SizeOfType(src.type()); offset *= size_of_t; std::memcpy( diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 3fe453bda2d9e..7725f336416db 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -64,8 +64,8 @@ class LayerNormKernel auto *mean_data = mean->mutable_data(ctx.GetPlace()); auto *var_data = var->mutable_data(ctx.GetPlace()); - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); framework::proto::VarType::Type x_dtype = x->type(); framework::proto::VarType::Type scale_bias_dtype; diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 5deedf084c697..7d03f9590357e 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -48,7 +48,7 @@ class MatrixInverseFunctor { memory::Copy(boost::get(context.GetPlace()), tmp_gpu_mat_data->ptr(), boost::get(context.GetPlace()), - a.data(), a.numel() * sizeof(T), context.stream()); + a.data(), a.numel() * sizeof(T), context.stream()); gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); } diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index df17b5e5f40bc..e3798b49dcbb1 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -492,9 +492,9 @@ class LambOpKernel : public framework::OpKernel { auto trust_ratio_div = ctx.AllocateTmpTensor(param.dims(), dev_ctx); - const void* param_ptr = param.template data(); + const void* param_ptr = param.data(); const void* master_param_ptr = - master_param ? master_param->template data() : nullptr; + master_param ? 
master_param->data() : nullptr; void* param_out_ptr = param_out.template mutable_data(ctx.GetPlace()); void* master_param_out_ptr = master_param_out diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 6c28daa7eac72..3c0c8ad1cafce 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -132,7 +132,7 @@ void BufferedReader::ReadAsync(size_t i) { memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), - cpu[i].data(), size); + cpu[i].data(), size); cuda[i].set_lod(cpu[i].lod()); } else { @@ -175,7 +175,7 @@ void BufferedReader::ReadAsync(size_t i) { platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); - auto cpu_ptr = cpu[i].data(); + auto cpu_ptr = cpu[i].data(); auto gpu_ptr = gpu_ptrs[i]; auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); @@ -239,7 +239,7 @@ void BufferedReader::ReadAsync(size_t i) { platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); - auto cpu_ptr = cpu[i].data(); + auto cpu_ptr = cpu[i].data(); auto npu_ptr = npu_ptrs[i]; auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index b5edc1dda533b..64751a21c837d 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -587,15 +587,13 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, collapsed_input_conj.data()); for_range(functor); MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), - collapsed_output.data())); + desc.get(), collapsed_input_conj.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { framework::Tensor collapsed_output_conj(collapsed_output.type()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), - collapsed_output_conj.data())); + desc.get(), collapsed_input.data(), collapsed_output_conj.data())); // conjugate the output platform::ForRange for_range(ctx, collapsed_output.numel()); math::ConjFunctor functor(collapsed_output_conj.data(), @@ -605,12 +603,10 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, } else { if (forward) { MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), - collapsed_output.data())); + desc.get(), collapsed_input.data(), collapsed_output.data())); } else { MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), - collapsed_output.data())); + desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 2066ce955cafe..d6a775dd55de8 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -115,22 +115,19 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, math::ConjFunctor functor(input->data(), input->numel(), input_conj.data()); for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), - forward); + exec_cufft_plan_raw(config, input_conj.data(), 
output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { forward = true; framework::Tensor out_conj(output->type()); out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), - forward); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); math::ConjFunctor functor(out_conj.data(), output->numel(), output->data()); for_range(functor); } else { - exec_cufft_plan_raw(config, input->data(), output->data(), - forward); + exec_cufft_plan_raw(config, input->data(), output->data(), forward); } } @@ -227,22 +224,19 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, math::ConjFunctor functor(input->data(), input->numel(), input_conj.data()); for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), - forward); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { forward = true; framework::Tensor out_conj(output->type()); out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), - forward); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); math::ConjFunctor functor(out_conj.data(), output->numel(), output->data()); for_range(functor); } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), - forward); + exec_hipfft_plan_raw(config, input->data(), output->data(), forward); } } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 1c1f63331d056..5ebf67587f3cb 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -405,7 +405,7 @@ class TensorRTEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope(scope, x); - calib_data.emplace(x, t.data()); + calib_data.emplace(x, t.data()); } temp_calibrator->setBatch(calib_data); RunNativeImpl(scope, dev_place); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index a1c5ed4fefbf3..58f784fdbc972 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -271,7 +271,7 @@ void Compiler::LowerWeights(const framework::ir::Graph* graph, shape.push_back(tensor.dims().at(i)); } popart::TensorInfo tensor_info(dtype, shape); - popart::ConstVoidData const_data{tensor.data(), tensor_info}; + popart::ConstVoidData const_data{tensor.data(), tensor_info}; popart::TensorId result = builder_->addInitializedInputTensor(const_data, var_name); tensors_.emplace(var_name, result); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 08ba50415dd5f..4dfe8c4efbeb9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -18,7 +18,7 @@ namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return tensor_->data(); } +void* PaddleIArray::data() { return tensor_->data(); } popart::DataType PaddleIArray::dataType() const { return VarType2PopartType(tensor_->type()); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 
670427128b870..3a3b9c8ccc238 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -84,7 +84,7 @@ std::unique_ptr> Tensor2IArray( popart::TensorInfo tensor_info(dtype, shape); return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); + reinterpret_cast(tensor.data()), tensor_info); } template diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index ed74a94c09502..78e5cb0ab106e 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -401,7 +401,7 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor, } aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { - void *ptr = tensor.data(); + void *ptr = tensor.data(); VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size(); auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size()); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 39c1fc9d041ea..e83057e682fef 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -150,8 +150,8 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { *npu_pinned_ptr = val; memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), npu_pinned_place, npu_pinned_ptr, - sizeof(T), GetCurrentNPUStream()); + tensor->data(), npu_pinned_place, npu_pinned_ptr, sizeof(T), + GetCurrentNPUStream()); auto npu_pinned_allocator = static_cast( diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 00e97bc2db420..4f22e83ac626f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -792,7 +792,7 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(&t, array, platform::CPUPlace(), true); // 3. allocate shared memory - void *data_ptr = t.data(); + void *data_ptr = t.data(); size_t data_size = t.numel() * framework::SizeOfType(t.type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); @@ -827,7 +827,7 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(&t, array, platform::CPUPlace(), true); // 3. allocate shared memory - void *data_ptr = t.data(); + void *data_ptr = t.data(); size_t data_size = t.numel() * framework::SizeOfType(t.type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); @@ -1857,7 +1857,7 @@ void BindImperative(py::module *m_ptr) { // 1. get LoDTensor auto *t = self->MutableVar()->GetMutable(); // 2. 
allocate shared memory - void *data_ptr = t->data(); + void *data_ptr = t->data(); size_t data_size = t->numel() * framework::SizeOfType(t->type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation( diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 9d3a858d1bdbf..b31b7456ebca7 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -729,7 +729,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, numel *= py_dims[i]; } - const void *tensor_buf_ptr = tensor.data(); + const void *tensor_buf_ptr = tensor.data(); std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h index 41b0f4744d12a..e102ecbc5de7d 100644 --- a/paddle/pten/api/lib/utils/storage.h +++ b/paddle/pten/api/lib/utils/storage.h @@ -83,8 +83,21 @@ class SharedStorage : public pten::Storage { size_ = 0; } - size_t size() const noexcept override { return size_; } - const paddle::platform::Place& place() const override { return place_; } + void set_data_shared( + const std::shared_ptr& holder) override { + data_ = holder; + if (holder) { + size_ = holder->size(); + place_ = holder->place(); + } + } + + size_t size() const noexcept override { + return data_ ? data_->size() : size_; + } + const paddle::platform::Place& place() const override { + return data_ ? data_->place() : place_; + } bool OwnsMemory() const noexcept override { return false; } const std::shared_ptr& GetAllocation() { diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index d8d83c575c4cf..1b4254ad2c103 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -41,12 +41,32 @@ DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta) : meta_(std::move(meta)), storage_(std::move(storage)) {} -DenseTensor::DenseTensor(const DenseTensor& other) - : meta_(other.meta()), storage_(copy_intrusive(other.storage_)) {} +DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } + if (other.storage_ != nullptr && other.storage_->data_shared()) { + storage_->set_data_shared(other.storage_->data_shared()); + } + +#ifdef PADDLE_WITH_MKLDNN + format_ = other.format_; +#endif +} DenseTensor& DenseTensor::operator=(const DenseTensor& other) { meta_ = other.meta(); - storage_ = std::move(copy_intrusive(other.storage_)); + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } + if (other.storage_ != nullptr && other.storage_->data_shared()) { + storage_->set_data_shared(other.storage_->data_shared()); + } +#ifdef PADDLE_WITH_MKLDNN + format_ = other.format_; +#endif return *this; } @@ -138,22 +158,22 @@ T* DenseTensor::data() { return reinterpret_cast(data()); } -const void* DenseTensor::data() const { +void* DenseTensor::data() { PADDLE_ENFORCE_NOT_NULL( storage_, paddle::platform::errors::PreconditionNotMet( "The storage must be valid when call the mutable data function.")); - return reinterpret_cast( - reinterpret_cast(storage_->data()) + meta_.offset); + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); } -void* DenseTensor::data() { +const void* DenseTensor::data() const { PADDLE_ENFORCE_NOT_NULL( storage_, paddle::platform::errors::PreconditionNotMet( "The storage 
must be valid when call the mutable data function.")); - return reinterpret_cast(reinterpret_cast(storage_->data()) + - meta_.offset); + return reinterpret_cast( + reinterpret_cast(storage_->data()) + meta_.offset); } void DenseTensor::set_meta(DenseTensorMeta&& meta) { @@ -174,12 +194,11 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { storage_ won't be initialized until the first call to mutable_data(place) */ -DenseTensor& DenseTensor::Resize(const DDim& dims) { +void DenseTensor::Resize(const DDim& dims) { meta_.dims = dims; if (storage_ != nullptr) { mutable_data(); } - return *this; } void DenseTensor::ResetLoD(const LoD& lod) { meta_.lod = lod; } @@ -211,36 +230,21 @@ DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128); /* From framework::Tensor */ /* --------------------------- */ DenseTensor::DenseTensor() { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); inplace_version_counter_ = std::make_shared(0); - meta_ = DenseTensorMeta(); meta_.dtype = paddle::experimental::DataType::FLOAT32; meta_.offset = 0; } DenseTensor::DenseTensor(const paddle::framework::proto::VarType::Type& dtype) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); inplace_version_counter_ = std::make_shared(0); - meta_ = DenseTensorMeta(); meta_.dtype = TransToPtenDataType(dtype); meta_.offset = 0; } -DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} - -DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( - const DenseTensor& src) { - PADDLE_ENFORCE_NOT_NULL( - inplace_version_counter_, - paddle::platform::errors::PreconditionNotMet( - "Tensor does not hold inplace_version_counter_.")); - - inplace_version_counter_ = src.inplace_version_counter_; - return *this; -} - size_t DenseTensor::memory_size() const { if (storage_ == nullptr || storage_->data_shared() == nullptr) { return 0UL; @@ -304,16 +308,15 @@ void DenseTensor::ResetHolder( paddle::platform::errors::Fatal( "Only the offset is supported to zero when the holder is reset.")); - if (storage_ == nullptr) { - PADDLE_THROW( - paddle::platform::errors::Fatal("storage_ has to be initialized before " - "calling ResetHolder() interface.")); - } + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "The storage must be valid when call the mutable data function.")); if (storage_->data_shared()) { PADDLE_ENFORCE_LE( numel() * SizeOf(dtype()) + meta_.offset, - storage_->data_shared()->size(), + holder->size(), paddle::platform::errors::InvalidArgument( "The size of Holder is not enough to store the Tensor.")); } @@ -333,95 +336,6 @@ void DenseTensor::set_type( meta_.dtype = TransToPtenDataType(type); } -DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - meta_.dims[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); - PADDLE_ENFORCE_LT( - begin_idx, - end_idx, - paddle::platform::errors::InvalidArgument( - "The start row index must be less than the end row index." 
- "But received the start index = %d, the end index = %d.", - begin_idx, - end_idx)); - - if (meta_.dims[0] == 1) { - return *this; - } else { - size_t base = numel() / meta_.dims[0]; - DenseTensor dst; - dst.storage_ = std::move(copy_intrusive(storage_)); - dst.meta_.layout = meta_.layout; - dst.meta_.dtype = meta_.dtype; - DDim dst_dims = meta_.dims; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); - return dst; - } -} - -std::vector DenseTensor::Split(int64_t split_size, - int64_t axis) const { - check_memory_size(); - - PADDLE_ENFORCE_GE(meta_.dims.size(), - 0, - paddle::platform::errors::OutOfRange( - "split expects at least a 1-dimensional tensor")); - - PADDLE_ENFORCE_GE( - split_size, - 0, - paddle::platform::errors::OutOfRange( - "split expects split_size be non-negative, but got split_size is %d", - split_size)); - - int64_t numel_size = meta_.dims[axis]; - - int64_t num_splits = 1; - if (split_size != 0) { - num_splits = - std::max((numel_size + split_size - 1) / split_size, 1); - } - - std::vector splits(num_splits); - int64_t last_split_size = split_size - (split_size * num_splits - numel_size); - - for (int64_t i = 0; i < num_splits; ++i) { - int64_t length = i < num_splits - 1 ? split_size : last_split_size; - splits[i] = Slice(i * split_size, i * split_size + length); - } - return splits; -} - -std::vector DenseTensor::Chunk(int64_t chunks, - int64_t axis) const { - check_memory_size(); - PADDLE_ENFORCE_GE(meta_.dims.size(), - 0, - paddle::platform::errors::OutOfRange( - "split expects at least a 1-dimensional tensor")); - PADDLE_ENFORCE_GE( - chunks, - 0, - paddle::platform::errors::OutOfRange( - "chunks expects to be greater than 0, but got chunks is %d", chunks)); - - int64_t numel_size = meta_.dims[axis]; - int64_t split_size = (numel_size + chunks - 1) / chunks; - return Split(split_size, axis); -} - void* DenseTensor::mutable_data(const paddle::platform::Place& place, paddle::framework::proto::VarType::Type type, size_t requested_size) { @@ -447,23 +361,16 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, if (storage_->data_shared() == nullptr || !(storage_->data_shared()->place() == place) || storage_->data_shared()->size() < size + meta_.offset) { - // Reset holder first before re-allocate to save memory storage_->Clear(); storage_->set_data_shared(paddle::memory::AllocShared(place, size)); meta_.offset = 0; } - return reinterpret_cast( - reinterpret_cast(storage_->data_shared()->ptr()) + - meta_.offset); + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); } void* DenseTensor::mutable_data(const paddle::platform::Place& place, size_t requested_size) { - if (storage_ == nullptr) { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "The tensor is not initialized.")); - } - return mutable_data(place, type(), requested_size); } @@ -481,8 +388,12 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, "] now")); size_t size = numel() * SizeOf(dtype()); + if (storage_ == nullptr) { + storage_ = make_intrusive(place); + } + /* some versions of boost::variant don't have operator!= */ - if (storage_ == nullptr || storage_->data_shared() == nullptr || + if (storage_->data_shared() == nullptr || !(storage_->data_shared()->place() == place) || storage_->data_shared()->size() < size + meta_.offset || !(paddle::platform::is_gpu_place(place) && @@ -491,9 +402,8 @@ void* DenseTensor::mutable_data(const 
paddle::platform::Place& place, storage_->set_data_shared(paddle::memory::AllocShared(place, size, stream)); meta_.offset = 0; } - return reinterpret_cast( - reinterpret_cast(storage_->data_shared()->ptr()) + - meta_.offset); + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); } /* @jim19930609: The following "mutable_data" only supports specific dtypes @@ -506,7 +416,7 @@ inline T* DenseTensor::mutable_data(const DDim& dims, const paddle::platform::Place& place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); - Resize(dims); + meta_.dims = dims; return mutable_data(place, requested_size); } @@ -518,6 +428,13 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, place, paddle::framework::DataTypeTrait::DataType(), requested_size)); } +void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ != nullptr && tensor.storage_ != nullptr) { + storage_->set_data_shared(tensor.storage_->data_shared()); + } + meta_.offset = tensor.meta().offset; +} + #define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ template dtype* DenseTensor::mutable_data( \ const DDim& dims, \ diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index eb149220f942d..fc92e84f52cea 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -157,7 +157,7 @@ class DenseTensor : public TensorBase, /// \param dims The new dims of the dense tensor. /// \param lod The new lod of the dense tensor. // void Resize(const DDim& dims); - DenseTensor& Resize(const DDim& dims); + void Resize(const DDim& dims); /// \brief Change the lod information in the metadata. /// \param lod The new lod of the dense tensor. @@ -204,7 +204,7 @@ class DenseTensor : public TensorBase, private: friend class CompatibleDenseTensorUtils; - private: + protected: DenseTensorMeta meta_; intrusive_ptr storage_; @@ -228,7 +228,7 @@ class DenseTensor : public TensorBase, explicit DenseTensor(const paddle::framework::proto::VarType::Type& dtype); inline bool IsInitialized() const { - return storage_ != nullptr && storage_->data() != nullptr; + return storage_ != nullptr && storage_->data_shared() != nullptr; } template @@ -256,18 +256,6 @@ class DenseTensor : public TensorBase, paddle::framework::proto::VarType::Type type, const paddle::platform::Stream& stream); - /*! The internal of two tensors share the same memory block. */ - DenseTensor& ShareDataWith(const DenseTensor& src); - - /*! The internal of two tensors share the same inplace version counter. */ - DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src); - - DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const; - - std::vector Split(int64_t split_size, int64_t axis) const; - - std::vector Chunk(int64_t chunks, int64_t axis) const; - /* @jim19930609: Remove dependency on protobuf after Tensor Unification. 
*/ paddle::framework::proto::VarType::Type type() const; @@ -288,17 +276,17 @@ class DenseTensor : public TensorBase, meta_.offset = 0; } - void ShareBufferWith(const DenseTensor& tensor) { - storage_ = std::move(copy_intrusive(tensor.storage_)); - meta_.offset = tensor.meta().offset; - } + void ShareBufferWith(const DenseTensor& tensor); void ShareDataTypeWith(const DenseTensor& tensor) { meta_.dtype = tensor.meta().dtype; } bool IsSharedBufferWith(const DenseTensor& src) const { - return IsSharedWith(src); + if (storage_ == nullptr || src.storage_ == nullptr) return false; + if (storage_->data_shared() == src.storage_->data_shared()) return true; + + return false; } const std::shared_ptr Holder() const { @@ -325,7 +313,7 @@ class DenseTensor : public TensorBase, return *inplace_version_counter_; } - private: + protected: std::shared_ptr inplace_version_counter_; /* @jim19930609: This is a hack @@ -365,6 +353,7 @@ class DenseTensor : public TensorBase, Will be adjusted/removed/moved in the near future */ + public: explicit DenseTensor(const LoD& lod); void set_lod(const LoD& lod); diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index 74c303697755a..fc56935eeaf19 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -60,7 +60,7 @@ class Storage : public intrusive_ref_counter { return data_; } - void set_data_shared( + virtual void set_data_shared( const std::shared_ptr& holder) { data_ = holder; } diff --git a/paddle/pten/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h index 51546bbc5056f..ed9a21e7f3a8a 100644 --- a/paddle/pten/core/utils/intrusive_ptr.h +++ b/paddle/pten/core/utils/intrusive_ptr.h @@ -40,8 +40,8 @@ class intrusive_ptr { rhs.reset(); } - intrusive_ptr& operator=(intrusive_ptr&& rhs) { - px = std::move(rhs.px); + intrusive_ptr& operator=(intrusive_ptr&& rhs) { + swap(rhs); return *this; } diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 814f85fde3e40..c6db228c2b757 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -133,7 +133,10 @@ TEST(dense_tensor, shallow_copy) { DenseTensor tensor_1(tensor_0); CHECK(tensor_0.meta() == tensor_1.meta()); - CHECK(tensor_0.release() == tensor_1.release()); + + // Copy constructor: Now shares the underlying shared_ptr instead + // of Storage + CHECK(tensor_0.release() != tensor_1.release()); } } // namespace tests From 897f63b468bfa649527b7ae183c9fe37a4309162 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 10 Jan 2022 11:31:04 +0800 Subject: [PATCH 050/151] [new-exec] refine ut (#38798) --- .../interpreter/test_standalone_executor.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 01b2cccfc48b2..48f95472c7ec7 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -32,18 +32,16 @@ def setUp(self): self.place.set_place(place) def build_program(self): - a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') - b = paddle.ones([2, 2]) * 2 - t = paddle.static.nn.fc(a, 2) - c = t + b - - main_program = paddle.fluid.default_main_program() - startup_program = paddle.fluid.default_startup_program() + startup_program = paddle.static.Program() + 
main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.ones([2, 2]) * 2 + t = paddle.static.nn.fc(a, 2) + c = t + b return startup_program, main_program, c - return standaloneexecutor, c - def test_interp_base(self): startup_program, main_program, c = self.build_program() standaloneexecutor = StandaloneExecutor( From b4dd7828325746cd9f273b2c0ec723d785a07aaa Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 10 Jan 2022 12:09:40 +0800 Subject: [PATCH 051/151] Revert "Reupload: Added numpy bf16 datatype support via custom pip package (#38703)" (#38777) This reverts commit ee813e349d017b3b1abb775ebf81a5282dd8f628. --- .../test_python_bf16_numpy_datatype.py | 34 ------------------- python/requirements.txt | 1 - 2 files changed, 35 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py diff --git a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py deleted file mode 100644 index a58d7d35807c6..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -from paddle_bfloat import bfloat16 -import unittest - - -class TestBF16DataType(unittest.TestCase): - def test_matmul(self): - a_bf16 = np.random.random((6, 7)).astype(bfloat16) - b_bf16 = np.random.random((7, 8)).astype(bfloat16) - c_bf16 = np.matmul(a_bf16, b_bf16) - - a_fp32 = a_bf16.astype(np.float32) - b_fp32 = b_bf16.astype(np.float32) - c_fp32 = np.matmul(a_fp32, b_fp32) - - self.assertTrue(np.allclose(c_bf16, c_fp32)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/requirements.txt b/python/requirements.txt index 5f2b788a81a0a..f2a4580a94e51 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,4 +5,3 @@ Pillow six decorator astor -paddle_bfloat==0.1.2 From 066a8063cb95b6a5c36a35536a4f020ee5c825bc Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 10 Jan 2022 12:28:15 +0800 Subject: [PATCH 052/151] fix attr missing in conv cudnn kernel (#38827) --- paddle/fluid/operators/conv_cudnn_op.cu | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 566e99c357fbe..cbe78d9a25b50 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -65,7 +65,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, @@ -386,7 +387,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, @@ -437,7 +439,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ctx, input_grad, &transformed_input_grad_channel); // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy // the data of input_grad to transformed_input_grad_channel. - if (ctx.Attr("use_addto")) { + if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { TransToChannelFirst( ctx, input_grad, &transformed_input_grad_channel); } @@ -703,15 +705,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // MIOPEN ONLY support beta to be 0.0f ScalingParamType beta = 0.0f; #else - ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + ScalingParamType beta = + (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; #endif - VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); + VLOG(4) << "Conv_grad: use_addto = " + << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); if (input_grad) { // When beta is 0, it is unnecessary to reset input_grad. // When beta is 1, the output cannot be reset since addt strategy used. 
#ifdef PADDLE_WITH_HIP - if (ctx.Attr("use_addto")) { + if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { Tensor temp_tensor(transformed_input_grad.type()); temp_tensor.Resize(transformed_input_grad.dims()); T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); @@ -878,7 +882,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, From 7b860a23354f8246888909473ebd1181a3d7cd5a Mon Sep 17 00:00:00 2001 From: taixiurong Date: Mon, 10 Jan 2022 12:56:49 +0800 Subject: [PATCH 053/151] 1.fix elementwise_add_grad bug. 2. add dropout kernel in kl2 (#38726) --- paddle/fluid/framework/tensor_util.cc | 8 ++ paddle/fluid/memory/memcpy.cc | 6 +- paddle/fluid/operators/dropout_op_xpu.cc | 78 +++++++++---------- .../elementwise/elementwise_add_op_xpu.cc | 41 +++++----- .../fluid/operators/masked_select_op_xpu.cc | 11 ++- .../fluid/platform/device/xpu/CMakeLists.txt | 2 +- .../fluid/platform/device/xpu/enforce_xpu.h | 31 ++++++++ .../device/xpu/tests/enforce_xpu_test.cc | 30 +++++++ paddle/fluid/platform/device/xpu/xpu_info.cc | 36 ++++++--- paddle/fluid/platform/device/xpu/xpu_info.h | 10 ++- 10 files changed, 174 insertions(+), 79 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index f2323f6e2c6ee..7fd125834a0c3 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -488,6 +488,14 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); + platform::XPUPlace xpu_dst_place = + BOOST_GET_CONST(platform::XPUPlace, dst_place); + platform::XPUPlace xpu_src_place = + BOOST_GET_CONST(platform::XPUPlace, src_place); + if (xpu_dst_place.device == xpu_src_place.device) { + auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place); + xpu_ctx->Wait(); + } } else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 4a10922adbf75..e6aed2c90dace 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -66,7 +66,7 @@ void Copy(platform::XPUPlace dst_place, VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; } - platform::MemcpySyncH2D(dst, src, num, dst_place.device); + platform::MemcpySyncH2D(dst, src, num, dst_place); } template <> @@ -78,7 +78,7 @@ void Copy(platform::CPUPlace dst_place, VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; } - platform::MemcpySyncD2H(dst, src, num, src_place.device); + platform::MemcpySyncD2H(dst, src, num, src_place); } template <> @@ -90,7 +90,7 @@ void Copy(platform::XPUPlace dst_place, VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; } - platform::MemcpySyncD2D(dst, dst_place.device, src, src_place.device, num); + platform::MemcpySyncD2D(dst, dst_place, src, src_place, num); } #endif diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 3335c0de429e4..cded525b030d8 
100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" #include #include -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -55,17 +55,11 @@ class DropoutXPUKernel : public framework::OpKernel { int r = xpu::constant(dev_ctx.x_context(), reinterpret_cast(y_data), y->numel(), XPUTyp(0)); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(constant) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant "); r = xpu::constant(dev_ctx.x_context(), reinterpret_cast(mask_data), mask->numel(), XPUTyp(0)); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(constant) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant "); return; } int r = xpu::dropout(dev_ctx.x_context(), @@ -73,26 +67,20 @@ class DropoutXPUKernel : public framework::OpKernel { reinterpret_cast(y->data()), reinterpret_cast(mask_data), seed, mask->numel(), is_upscale, dropout_prob); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(dropout) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout "); } else { float scale = (is_upscale) ? (1.0) : (static_cast(1.0f - dropout_prob)); int r = xpu::scale( dev_ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), x->numel(), false, scale, 0.0f); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale "); } } }; template class DropoutGradXPUKernel : public framework::OpKernel { - using XPUTyp = typename XPUTypeTrait::Type; + using XPUType = typename XPUTypeTrait::Type; public: void Compute(const framework::ExecutionContext& context) const override { @@ -108,31 +96,43 @@ class DropoutGradXPUKernel : public framework::OpKernel { context.Attr("dropout_implementation"); float dropout_prob = context.Attr("dropout_prob"); const T* mask_data = mask->data(); - framework::Tensor mask_new; - if (dropout_implementation == "upscale_in_train") { - mask_new = context.AllocateTmpTensor( - mask->dims(), dev_ctx); + + if (dropout_implementation != "upscale_in_train") { + int r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(grad_y->data()), + reinterpret_cast(mask_data), + reinterpret_cast(grad_x->data()), + grad_y->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul "); + return; + } + + paddle::platform::XPUVersion version = dev_ctx.xpu_version(); + if (version == paddle::platform::XPUVersion::XPU1) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm(mask->numel()); float scale = (dropout_prob == 1.0f) ? 
(1.0f) : (1.0f / (1.0f - dropout_prob)); int r = xpu::scale(dev_ctx.x_context(), - reinterpret_cast(mask->data()), - reinterpret_cast(mask_new.data()), - mask->numel(), false, scale, 0.0f); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - mask_data = mask_new.data(); + reinterpret_cast(mask->data()), + reinterpret_cast(mask_new), mask->numel(), + false, scale, 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale "); + r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(grad_y->data()), + reinterpret_cast(mask_new), + reinterpret_cast(grad_x->data()), + grad_y->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul "); + } else { + int r = + xpu::dropout_grad(dev_ctx.x_context(), + reinterpret_cast(mask->data()), + reinterpret_cast(grad_y->data()), + reinterpret_cast(grad_x->data()), + dropout_prob, grad_y->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout_grad "); } - - int r = xpu::mul( - dev_ctx.x_context(), reinterpret_cast(grad_y->data()), - reinterpret_cast(mask_data), - reinterpret_cast(grad_x->data()), grad_y->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(mul) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); } }; } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 769e61aba6131..6167452728a59 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -106,39 +107,43 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { const T* dz_data = dz->data(); auto& dev_ctx = ctx.template device_context(); + if (dx != nullptr) { + T* dx_data = dx->mutable_data(ctx.GetPlace()); if (rdims_for_x.size() == 0) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dx); + if (dx_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dx); + } } else { - T* dx_data = dx->mutable_data(ctx.GetPlace()); + // For inplace strategy, dx will be stored in addr of dz, which makes + // the result of dy wrong. 
+ if (dx->IsSharedBufferWith(*dz)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + int ret = xpu::reduce_sum( dev_ctx.x_context(), reinterpret_cast(dz_data), reinterpret_cast(dx_data), z_dims_vec, rdims_for_x); - PADDLE_ENFORCE_EQ( - ret, xpu::SUCCESS, - platform::errors::External("XPU kernel reduce_sum occur error in " - "XPUElementwise error code ", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); } } if (dy != nullptr) { + T* dy_data = dy->mutable_data(ctx.GetPlace()); if (rdims_for_y.size() == 0) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dy); + if (dy_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dy); + } } else { - T* dy_data = dy->mutable_data(ctx.GetPlace()); int ret = xpu::reduce_sum( dev_ctx.x_context(), reinterpret_cast(dz_data), reinterpret_cast(dy_data), z_dims_vec, rdims_for_y); - PADDLE_ENFORCE_EQ( - ret, xpu::SUCCESS, - platform::errors::External("XPU kernel reduce_sum occur error in " - "XPUElementwise error code ", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); } } } diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index c575f133b1572..dbf8793b5cb6f 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -42,8 +42,10 @@ class MaskedSelectXPUKernel : public framework::OpKernel { int* out_size = RAII_GUARD.alloc_l3_or_gm(1); int out_size_cpu; - PADDLE_ENFORCE_XPU_SUCCESS(xpu::nonzero_count( - dev_ctx.x_context(), mask_data, out_size, mask->numel())); + PADDLE_ENFORCE_XDNN_SUCCESS( + xpu::nonzero_count(dev_ctx.x_context(), mask_data, out_size, + mask->numel()), + "nonzero_count "); memory::Copy(platform::CPUPlace(), static_cast(&out_size_cpu), BOOST_GET_CONST(platform::XPUPlace, mask->place()), static_cast(out_size), sizeof(int32_t)); @@ -55,9 +57,10 @@ class MaskedSelectXPUKernel : public framework::OpKernel { auto input_shape = framework::vectorize(input_dim); auto mask_shape = framework::vectorize(mask_dim); - PADDLE_ENFORCE_XPU_SUCCESS( + PADDLE_ENFORCE_XDNN_SUCCESS( xpu::masked_select(dev_ctx.x_context(), input_data, mask_data, out_data, - input_shape, mask_shape, out_size_cpu)); + input_shape, mask_shape, out_size_cpu), + "masked_select"); } }; diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index b1fc9a0cedd0b..f89c8c193ae7c 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -4,7 +4,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 839f14067782d..4c85168f68dd3 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -113,6 +113,23 @@ inline const char* bkclGetErrorString(BKCLResult_t stat) { } } +inline const char* xdnnGetErrorString(int stat) { + switch (stat) { + case xpu::Error_t::SUCCESS: + return "XDNN_SUCCESS"; + case xpu::Error_t::INVALID_PARAM: + return "XDNN_INVALID_PARAM"; + 
case xpu::Error_t::RUNTIME_ERROR: + return "XDNN_RUNTIME_ERROR"; + case xpu::Error_t::NO_ENOUGH_WORKSPACE: + return "XDNN_NO_ENOUGH_WORKSPACE"; + case xpu::Error_t::NOT_IMPLEMENT: + return "XDNN_NOT_IMPLEMENT"; + default: + return "Unknown XDNN status"; + } +} + inline std::string build_xpu_error_msg(int stat) { std::string msg("XPU Error <" + std::to_string(stat) + ">, "); return msg + xpuGetErrorString(stat) + " "; @@ -123,6 +140,10 @@ inline std::string build_xpu_error_msg(BKCLResult_t stat) { return msg + bkclGetErrorString(stat) + " "; } +inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { + return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; +} + namespace details { template @@ -156,5 +177,15 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); } \ } while (0) +#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc index 730bcdb37fd7b..8cba98f3fb352 100644 --- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc +++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc @@ -33,6 +33,24 @@ bool CheckXPUStatusFailure(T value, const std::string& msg) { } } +template +bool CheckXDNNStatusSuccess(T value, const std::string& msg = "success") { + PADDLE_ENFORCE_XDNN_SUCCESS(value, "XDNN Error "); + return true; +} + +template +bool CheckXDNNStatusFailure(T value, const std::string& msg) { + try { + PADDLE_ENFORCE_XDNN_SUCCESS(value, "XDNN Error "); + return false; + } catch (paddle::platform::EnforceNotMet& error) { + std::string ex_msg = error.what(); + std::cout << ex_msg << std::endl; + return ex_msg.find(msg) != std::string::npos; + } +} + TEST(enforce, xpu_status) { EXPECT_TRUE(CheckXPUStatusSuccess(static_cast(XPU_SUCCESS))); EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_INVALID_DEVICE), @@ -114,3 +132,15 @@ TEST(enforce, bkcl_status) { EXPECT_TRUE( CheckXPUStatusFailure(BKCL_INTERNAL_ERROR, "BKCL_INTERNAL_ERROR")); } + +TEST(enforce, xdnn_status) { + EXPECT_TRUE(CheckXDNNStatusSuccess(xpu::Error_t::SUCCESS)); + EXPECT_TRUE(CheckXDNNStatusFailure(xpu::Error_t::INVALID_PARAM, + "XDNN_INVALID_PARAM")); + EXPECT_TRUE(CheckXDNNStatusFailure(xpu::Error_t::RUNTIME_ERROR, + "XDNN_RUNTIME_ERROR")); + EXPECT_TRUE(CheckXDNNStatusFailure(xpu::Error_t::NO_ENOUGH_WORKSPACE, + "XDNN_NO_ENOUGH_WORKSPACE")); + EXPECT_TRUE(CheckXDNNStatusFailure(xpu::Error_t::NOT_IMPLEMENT, + "XDNN_NOT_IMPLEMENT")); +} diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 483b1c5ce2795..a8c6ee8f3b035 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,8 +14,11 @@ limitations under the License. 
*/ #include #include #include "gflags/gflags.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" PADDLE_DEFINE_EXPORTED_string( @@ -56,7 +59,7 @@ int GetRuntimeVersion() { /**************************** Device Management **************************/ static int GetDeviceCountImpl() { - const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); + const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); if (xpu_visible_devices != nullptr) { std::string xpu_visible_devices_str(xpu_visible_devices); if (std::all_of(xpu_visible_devices_str.begin(), @@ -114,28 +117,39 @@ std::vector GetXPUSelectedDevices() { /**************************** Memory Management **************************/ -void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id) { - platform::XPUDeviceGuard guard(dev_id); +void MemcpySyncH2D(void* dst, const void* src, size_t count, + const platform::XPUPlace& dst_place) { + platform::XPUDeviceGuard guard(dst_place.device); PADDLE_ENFORCE_XPU_SUCCESS( xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } -void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id) { - platform::XPUDeviceGuard guard(dev_id); +void MemcpySyncD2H(void* dst, const void* src, size_t count, + const platform::XPUPlace& src_place) { + platform::XPUDeviceGuard guard(src_place.device); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + dev_ctx->Wait(); PADDLE_ENFORCE_XPU_SUCCESS( xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); } -void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id, +// if src.device == dst.device and you need sync , after call this function, +// need to call xpu_wait() +void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, + const void* src, const platform::XPUPlace& src_place, size_t count) { int dev_id = GetXPUCurrentDeviceId(); - if (dst_id == dev_id && src_id == dev_id) { - platform::XPUDeviceGuard guard(dev_id); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE)); + if (dst_place.device == dev_id && src_place.device == dev_id) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + PADDLE_ENFORCE_XDNN_SUCCESS( + xpu::copy(dev_ctx->x_context(), static_cast(src), + static_cast(dst), count), + "copy "); } else { PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy_peer(dst_id, dst, src_id, src, count)); + xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); } } diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 82672e61e51f4..018ba1bce163b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -16,6 +16,7 @@ limitations under the License. */ namespace paddle { namespace platform { +class XPUPlace; /***** Version Management *****/ //! Get the version of XPU Driver @@ -41,9 +42,12 @@ std::vector GetXPUSelectedDevices(); /***** Memory Management *****/ //! Copy memory from address src to dst synchronously. 
-void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id); -void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id); -void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id, +void MemcpySyncH2D(void *dst, const void *src, size_t count, + const platform::XPUPlace &dst_place); +void MemcpySyncD2H(void *dst, const void *src, size_t count, + const platform::XPUPlace &src_place); +void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, + const void *src, const platform::XPUPlace &src_place, size_t count); class XPUDeviceGuard { From 46e856c7c795a2b1ef42770efab64b820d4b4621 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 13:30:54 +0800 Subject: [PATCH 054/151] Remove the labels range check under the dynamic graph --- .../unittests/test_cross_entropy_loss.py | 28 ------------- python/paddle/nn/functional/loss.py | 39 +++++++++---------- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index d3ed76e34a614..29676fcff1216 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1465,34 +1465,6 @@ def test_WeightLength_NotEqual(): self.assertRaises(ValueError, test_WeightLength_NotEqual) - def test_LabelValue_ExceedMax(): - input_data = paddle.rand(shape=[20, 100]) - label_data = paddle.randint( - 0, 100, shape=[20, 1], dtype="int64") - label_data[0] = 100 - weight_data = paddle.rand([100]) - paddle.nn.functional.cross_entropy( - input=input_data, - label=label_data, - weight=weight_data, - ignore_index=-100) - - self.assertRaises(ValueError, test_LabelValue_ExceedMax) - - def test_LabelValue_ExceedMin(): - input_data = paddle.rand(shape=[20, 100]) - label_data = paddle.randint( - 0, 100, shape=[20, 1], dtype="int64") - label_data[0] = -1 - weight_data = paddle.rand([100]) - paddle.nn.functional.cross_entropy( - input=input_data, - label=label_data, - weight=weight_data, - ignore_index=-100) - - self.assertRaises(ValueError, test_LabelValue_ExceedMin) - def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 554651ea1332c..05f06ef534421 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1665,26 +1665,6 @@ def cross_entropy(input, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) if in_dygraph_mode(): - if soft_label == False: - valid_label = paddle.where(label == ignore_index, - paddle.zeros_like(label), label) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label < 0)) > 0: - invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label < 0)) - raise ValueError( - "Target({}) is out of class_dimension's lower bound({})". 
- format(invalid_label[0], 0)) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: - invalid_label = paddle.gather_nd( - valid_label, - paddle.nonzero(valid_label >= input.shape[axis])) - raise ValueError( - "Target({}) is out of class_dimension's upper bound({})". - format(invalid_label[0], input.shape[axis] - 1)) if core.is_compiled_with_npu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', @@ -1716,6 +1696,25 @@ def cross_entropy(input, out = _C_ops.elementwise_mul(out, weight_gather_reshape) else: + valid_label = paddle.where(label == ignore_index, + paddle.zeros_like(label), label) + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label < 0)) > 0: + invalid_label = paddle.gather_nd( + valid_label, paddle.nonzero(valid_label < 0)) + raise ValueError( + "Target({}) is out of class_dimension's lower bound({})". + format(invalid_label[0], 0)) + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: + invalid_label = paddle.gather_nd( + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) + raise ValueError( + "Target({}) is out of class_dimension's upper bound({})". + format(invalid_label[0], input.shape[axis] - 1)) if input.shape[axis] != weight.shape[-1]: raise ValueError( "input's class_dimension({}) must equal to " From 87d9fdaee0d2ce147afe4cefcf608f8b2cd7365b Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 13:38:30 +0800 Subject: [PATCH 055/151] Remove the labels range check under the dynamic graph --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 05f06ef534421..8eb6e05fc04e6 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1681,7 +1681,7 @@ def cross_entropy(input, # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases. if soft_label == True: # chajchaj: - # weight's shape is C, where C is class num. + # weight's shape is C, where C is class num. # for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. 
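[Editor's aside, not part of the surrounding patch: the comments above describe how a per-class weight vector of shape C is collapsed into a per-sample weight by contracting it with the soft label. A minimal NumPy sketch of just that shape mechanics, with made-up sizes N=4 and C=3, is:

    import numpy as np

    N, C = 4, 3
    soft_label = np.full((N, C), 1.0 / C)   # [N, C], each row sums to 1
    weight = np.array([0.5, 1.0, 2.0])      # [C], one weight per class

    # The contraction runs over the class axis: [N, C] @ [C] -> [N]
    weight_gather = soft_label @ weight
    print(weight_gather.shape)              # (4,)

In the 2d case the same contraction over the trailing class axis turns an [N, H, W, C] label into an [N, H, W] weight, as the comment states.]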
weight_gather = paddle.matmul( @@ -1697,7 +1697,7 @@ def cross_entropy(input, else: valid_label = paddle.where(label == ignore_index, - paddle.zeros_like(label), label) + paddle.zeros_like(label), label) # TODO: Temporarily use paddle.nonzero instead of paddle.max # to detect and find out possible illegal label values if len(paddle.nonzero(valid_label < 0)) > 0: From 1e3e17df33654a2f291bbf3daf9e2d40b07dd967 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 13:46:08 +0800 Subject: [PATCH 056/151] Remove the labels range check under the dynamic graph --- python/paddle/nn/functional/loss.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 8eb6e05fc04e6..f13f14cdde118 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1696,6 +1696,13 @@ def cross_entropy(input, out = _C_ops.elementwise_mul(out, weight_gather_reshape) else: + if input.shape[axis] != weight.shape[-1]: + raise ValueError( + "input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided" \ + .format(input.shape[axis], weight.shape[-1])) + valid_label = paddle.where(label == ignore_index, paddle.zeros_like(label), label) # TODO: Temporarily use paddle.nonzero instead of paddle.max @@ -1715,12 +1722,6 @@ def cross_entropy(input, raise ValueError( "Target({}) is out of class_dimension's upper bound({})". format(invalid_label[0], input.shape[axis] - 1)) - if input.shape[axis] != weight.shape[-1]: - raise ValueError( - "input's class_dimension({}) must equal to " - "weight's class_dimension({}) " - "when weight is provided" \ - .format(input.shape[axis], weight.shape[-1])) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) From d49daff06f89a7854c039e6dd21be18d4e852160 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 20:03:02 +0800 Subject: [PATCH 057/151] restore test for min,max labels --- .../unittests/test_cross_entropy_loss.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 29676fcff1216..adf11e815faa9 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1465,6 +1465,34 @@ def test_WeightLength_NotEqual(): self.assertRaises(ValueError, test_WeightLength_NotEqual) + def test_LabelValue_ExceedMax(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") # hard label + label_data[0] = 100 + weight_data = paddle.rand([100]) # provide weight + paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=-100) + + self.assertRaises(ValueError, test_LabelValue_ExceedMax) + + def test_LabelValue_ExceedMin(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") # hard label + label_data[0] = -1 + weight_data = paddle.rand([100]) # provide weight + paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=-100) + + self.assertRaises(ValueError, test_LabelValue_ExceedMin) + def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') label_np = 
np.random.randint(0, 4, size=(2)).astype(np.int64) From 04cd0aef341221bec17461045ce860cd27e6f174 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 20:34:53 +0800 Subject: [PATCH 058/151] change error to IndexError --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index adf11e815faa9..28e286be6389b 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1463,7 +1463,7 @@ def test_WeightLength_NotEqual(): weight=weight_data, ignore_index=-100) - self.assertRaises(ValueError, test_WeightLength_NotEqual) + self.assertRaises(IndexError, test_WeightLength_NotEqual) def test_LabelValue_ExceedMax(): input_data = paddle.rand(shape=[20, 100]) @@ -1477,7 +1477,7 @@ def test_LabelValue_ExceedMax(): weight=weight_data, ignore_index=-100) - self.assertRaises(ValueError, test_LabelValue_ExceedMax) + self.assertRaises(IndexError, test_LabelValue_ExceedMax) def test_LabelValue_ExceedMin(): input_data = paddle.rand(shape=[20, 100]) From 3021370307d8825c6a7bbba57262f92b45a6efae Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 20:35:20 +0800 Subject: [PATCH 059/151] change error to IndexError --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 28e286be6389b..d8f541bf48c31 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1463,7 +1463,7 @@ def test_WeightLength_NotEqual(): weight=weight_data, ignore_index=-100) - self.assertRaises(IndexError, test_WeightLength_NotEqual) + self.assertRaises(ValueError, test_WeightLength_NotEqual) def test_LabelValue_ExceedMax(): input_data = paddle.rand(shape=[20, 100]) @@ -1491,7 +1491,7 @@ def test_LabelValue_ExceedMin(): weight=weight_data, ignore_index=-100) - self.assertRaises(ValueError, test_LabelValue_ExceedMin) + self.assertRaises(IndexError, test_LabelValue_ExceedMin) def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') From 3997f99a767d1c2b35921841d943d28216478fa7 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Sun, 26 Dec 2021 23:52:58 +0800 Subject: [PATCH 060/151] change to ValueError --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index d8f541bf48c31..adf11e815faa9 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1477,7 +1477,7 @@ def test_LabelValue_ExceedMax(): weight=weight_data, ignore_index=-100) - self.assertRaises(IndexError, test_LabelValue_ExceedMax) + self.assertRaises(ValueError, test_LabelValue_ExceedMax) def test_LabelValue_ExceedMin(): input_data = paddle.rand(shape=[20, 100]) @@ -1491,7 +1491,7 @@ def test_LabelValue_ExceedMin(): weight=weight_data, ignore_index=-100) - 
self.assertRaises(IndexError, test_LabelValue_ExceedMin) + self.assertRaises(ValueError, test_LabelValue_ExceedMin) def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') From 739cff2dc1740da5b130e5991ecce22971461745 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 27 Dec 2021 11:40:07 +0800 Subject: [PATCH 061/151] change to IndexError --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index adf11e815faa9..12dc47785d262 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from ast import Index import paddle import paddle.fluid as fluid @@ -1477,7 +1478,7 @@ def test_LabelValue_ExceedMax(): weight=weight_data, ignore_index=-100) - self.assertRaises(ValueError, test_LabelValue_ExceedMax) + self.assertRaises(IndexError, test_LabelValue_ExceedMax) def test_LabelValue_ExceedMin(): input_data = paddle.rand(shape=[20, 100]) @@ -1491,7 +1492,7 @@ def test_LabelValue_ExceedMin(): weight=weight_data, ignore_index=-100) - self.assertRaises(ValueError, test_LabelValue_ExceedMin) + self.assertRaises(IndexError, test_LabelValue_ExceedMin) def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') From 7ddfec009ce30f0aa871b0ee338ac38a5603aec8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 27 Dec 2021 13:11:39 +0800 Subject: [PATCH 062/151] change to IndexError --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f13f14cdde118..cdf80fb58d74c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1710,7 +1710,7 @@ def cross_entropy(input, if len(paddle.nonzero(valid_label < 0)) > 0: invalid_label = paddle.gather_nd( valid_label, paddle.nonzero(valid_label < 0)) - raise ValueError( + raise IndexError( "Target({}) is out of class_dimension's lower bound({})". format(invalid_label[0], 0)) # TODO: Temporarily use paddle.nonzero instead of paddle.max @@ -1719,7 +1719,7 @@ def cross_entropy(input, invalid_label = paddle.gather_nd( valid_label, paddle.nonzero(valid_label >= input.shape[axis])) - raise ValueError( + raise IndexError( "Target({}) is out of class_dimension's upper bound({})". 
format(invalid_label[0], input.shape[axis] - 1)) From 51398ab90ad7146a546e62576320988c4f54f67f Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 27 Dec 2021 15:56:14 +0800 Subject: [PATCH 063/151] remove hard labels check --- .../unittests/test_cross_entropy_loss.py | 28 ------------------- python/paddle/nn/functional/loss.py | 17 ----------- 2 files changed, 45 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 12dc47785d262..a30e5741bc8d4 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1466,34 +1466,6 @@ def test_WeightLength_NotEqual(): self.assertRaises(ValueError, test_WeightLength_NotEqual) - def test_LabelValue_ExceedMax(): - input_data = paddle.rand(shape=[20, 100]) - label_data = paddle.randint( - 0, 100, shape=[20, 1], dtype="int64") # hard label - label_data[0] = 100 - weight_data = paddle.rand([100]) # provide weight - paddle.nn.functional.cross_entropy( - input=input_data, - label=label_data, - weight=weight_data, - ignore_index=-100) - - self.assertRaises(IndexError, test_LabelValue_ExceedMax) - - def test_LabelValue_ExceedMin(): - input_data = paddle.rand(shape=[20, 100]) - label_data = paddle.randint( - 0, 100, shape=[20, 1], dtype="int64") # hard label - label_data[0] = -1 - weight_data = paddle.rand([100]) # provide weight - paddle.nn.functional.cross_entropy( - input=input_data, - label=label_data, - weight=weight_data, - ignore_index=-100) - - self.assertRaises(IndexError, test_LabelValue_ExceedMin) - def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cdf80fb58d74c..c1800a781d4ba 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1705,23 +1705,6 @@ def cross_entropy(input, valid_label = paddle.where(label == ignore_index, paddle.zeros_like(label), label) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label < 0)) > 0: - invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label < 0)) - raise IndexError( - "Target({}) is out of class_dimension's lower bound({})". - format(invalid_label[0], 0)) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: - invalid_label = paddle.gather_nd( - valid_label, - paddle.nonzero(valid_label >= input.shape[axis])) - raise IndexError( - "Target({}) is out of class_dimension's upper bound({})". 
- format(invalid_label[0], input.shape[axis] - 1)) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) From 9765be09300948e0d9237301725414c274defc95 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 28 Dec 2021 13:02:56 +0800 Subject: [PATCH 064/151] Update test_cross_entropy_loss.py --- python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index a30e5741bc8d4..29676fcff1216 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import print_function -from ast import Index import paddle import paddle.fluid as fluid From b4eec5d5adaa151479720dbc0e49b6408e3c7f95 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 28 Dec 2021 16:39:32 +0800 Subject: [PATCH 065/151] replace .where to '==' --- python/paddle/nn/functional/loss.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c1800a781d4ba..4d09f1d5c38fc 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1703,11 +1703,16 @@ def cross_entropy(input, "when weight is provided" \ .format(input.shape[axis], weight.shape[-1])) - valid_label = paddle.where(label == ignore_index, - paddle.zeros_like(label), label) + ignore_weight_mask = ( + label != ignore_index) # ignored position will be False + + valid_label = paddle.cast( + ignore_weight_mask, + dtype=label.dtype) * label # ignored position will be 0 + + ignore_weight_mask = paddle.cast( + ignore_weight_mask, out.dtype) # convert from 0 to 0.0 - ignore_weight_mask = paddle.cast((label != ignore_index), - out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: # TODO: Temporarily use squeeze instead of squeeze_ @@ -1821,10 +1826,16 @@ def cross_entropy(input, "when weight is provided" \ .format(input.shape[axis], weight.shape[-1])) - valid_label = paddle.where(label == ignore_index, - paddle.zeros_like(label), label) - ignore_weight_mask = paddle.cast((label != ignore_index), - input.dtype) + ignore_weight_mask = ( + label != ignore_index) # ignored position will be False + + valid_label = paddle.cast( + ignore_weight_mask, + dtype=label.dtype) * label # ignored position will be 0 + + ignore_weight_mask = paddle.cast(ignore_weight_mask, + out.dtype) # convert from 0 to 0.0 + if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) From 09d4a3a4737152b9fac4b105dcc2f389c3e6be2a Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 28 Dec 2021 17:32:21 +0800 Subject: [PATCH 066/151] add static label check --- .../unittests/test_cross_entropy_loss.py | 28 ++++++++++++++++ python/paddle/nn/functional/loss.py | 32 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 29676fcff1216..d3ed76e34a614 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1465,6 +1465,34 @@ 
def test_WeightLength_NotEqual(): self.assertRaises(ValueError, test_WeightLength_NotEqual) + def test_LabelValue_ExceedMax(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") + label_data[0] = 100 + weight_data = paddle.rand([100]) + paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=-100) + + self.assertRaises(ValueError, test_LabelValue_ExceedMax) + + def test_LabelValue_ExceedMin(): + input_data = paddle.rand(shape=[20, 100]) + label_data = paddle.randint( + 0, 100, shape=[20, 1], dtype="int64") + label_data[0] = -1 + weight_data = paddle.rand([100]) + paddle.nn.functional.cross_entropy( + input=input_data, + label=label_data, + weight=weight_data, + ignore_index=-100) + + self.assertRaises(ValueError, test_LabelValue_ExceedMin) + def static_test_WeightLength_NotEqual(): input_np = np.random.random([2, 4]).astype('float32') label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 4d09f1d5c38fc..aee0366ab3093 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1710,6 +1710,22 @@ def cross_entropy(input, ignore_weight_mask, dtype=label.dtype) * label # ignored position will be 0 + if len(paddle.nonzero(valid_label < 0)) > 0: + invalid_label = paddle.gather_nd( + valid_label, paddle.nonzero(valid_label < 0)) + raise ValueError( + "Target({}) is out of class_dimension's lower bound({})". + format(invalid_label[0], 0)) + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: + invalid_label = paddle.gather_nd( + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) + raise ValueError( + "Target({}) is out of class_dimension's upper bound({})". + format(invalid_label[0], input.shape[axis] - 1)) + ignore_weight_mask = paddle.cast( ignore_weight_mask, out.dtype) # convert from 0 to 0.0 @@ -1833,6 +1849,22 @@ def cross_entropy(input, ignore_weight_mask, dtype=label.dtype) * label # ignored position will be 0 + if len(paddle.nonzero(valid_label < 0)) > 0: + invalid_label = paddle.gather_nd( + valid_label, paddle.nonzero(valid_label < 0)) + raise ValueError( + "Target({}) is out of class_dimension's lower bound({})". + format(invalid_label[0], 0)) + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: + invalid_label = paddle.gather_nd( + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) + raise ValueError( + "Target({}) is out of class_dimension's upper bound({})". 
+ format(invalid_label[0], input.shape[axis] - 1)) + ignore_weight_mask = paddle.cast(ignore_weight_mask, out.dtype) # convert from 0 to 0.0 From 3ab9ace5605c01347e6b200e2a0223247f8de483 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 28 Dec 2021 21:51:14 +0800 Subject: [PATCH 067/151] update code --- python/paddle/nn/functional/loss.py | 81 +++++++++++------------------ 1 file changed, 29 insertions(+), 52 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index aee0366ab3093..f571f5d30285f 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1665,6 +1665,27 @@ def cross_entropy(input, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) if in_dygraph_mode(): + if not soft_label: + valid_label = paddle.cast( + label != ignore_index, dtype=label.dtype) * label + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label < 0)) > 0: + invalid_label = paddle.gather_nd( + valid_label, paddle.nonzero(valid_label < 0)) + raise ValueError( + "Target({}) is out of class_dimension's lower bound({})". + format(invalid_label[0], 0)) + # TODO: Temporarily use paddle.nonzero instead of paddle.max + # to detect and find out possible illegal label values + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: + invalid_label = paddle.gather_nd( + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) + raise ValueError( + "Target({}) is out of class_dimension's upper bound({})". + format(invalid_label[0], input.shape[axis] - 1)) + if core.is_compiled_with_npu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', @@ -1681,7 +1702,7 @@ def cross_entropy(input, # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases. if soft_label == True: # chajchaj: - # weight's shape is C, where C is class num. + # weight's shape is C, where C is class num. # for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. weight_gather = paddle.matmul( @@ -1703,32 +1724,8 @@ def cross_entropy(input, "when weight is provided" \ .format(input.shape[axis], weight.shape[-1])) - ignore_weight_mask = ( - label != ignore_index) # ignored position will be False - - valid_label = paddle.cast( - ignore_weight_mask, - dtype=label.dtype) * label # ignored position will be 0 - - if len(paddle.nonzero(valid_label < 0)) > 0: - invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label < 0)) - raise ValueError( - "Target({}) is out of class_dimension's lower bound({})". - format(invalid_label[0], 0)) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: - invalid_label = paddle.gather_nd( - valid_label, - paddle.nonzero(valid_label >= input.shape[axis])) - raise ValueError( - "Target({}) is out of class_dimension's upper bound({})". 
- format(invalid_label[0], input.shape[axis] - 1)) - - ignore_weight_mask = paddle.cast( - ignore_weight_mask, out.dtype) # convert from 0 to 0.0 - + ignore_weight_mask = paddle.cast((label != ignore_index), + out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: # TODO: Temporarily use squeeze instead of squeeze_ @@ -1842,32 +1839,12 @@ def cross_entropy(input, "when weight is provided" \ .format(input.shape[axis], weight.shape[-1])) - ignore_weight_mask = ( - label != ignore_index) # ignored position will be False - - valid_label = paddle.cast( - ignore_weight_mask, - dtype=label.dtype) * label # ignored position will be 0 - - if len(paddle.nonzero(valid_label < 0)) > 0: - invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label < 0)) - raise ValueError( - "Target({}) is out of class_dimension's lower bound({})". - format(invalid_label[0], 0)) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: - invalid_label = paddle.gather_nd( - valid_label, - paddle.nonzero(valid_label >= input.shape[axis])) - raise ValueError( - "Target({}) is out of class_dimension's upper bound({})". - format(invalid_label[0], input.shape[axis] - 1)) - - ignore_weight_mask = paddle.cast(ignore_weight_mask, - out.dtype) # convert from 0 to 0.0 + valid_label = paddle.multiply( + paddle.cast( + label != ignore_index, dtype=label.dtype), label) + ignore_weight_mask = paddle.cast((label != ignore_index), + input.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) From e30150dd45514edaf59e29d4ef5e841360973233 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 10 Jan 2022 00:45:56 +0800 Subject: [PATCH 068/151] replace where with min and max --- python/paddle/nn/functional/loss.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f571f5d30285f..90ada8c3c5ee6 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1665,27 +1665,17 @@ def cross_entropy(input, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) if in_dygraph_mode(): - if not soft_label: + if soft_label == False: valid_label = paddle.cast( label != ignore_index, dtype=label.dtype) * label - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label < 0)) > 0: - invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label < 0)) - raise ValueError( - "Target({}) is out of class_dimension's lower bound({})". - format(invalid_label[0], 0)) - # TODO: Temporarily use paddle.nonzero instead of paddle.max - # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: - invalid_label = paddle.gather_nd( - valid_label, - paddle.nonzero(valid_label >= input.shape[axis])) - raise ValueError( - "Target({}) is out of class_dimension's upper bound({})". - format(invalid_label[0], input.shape[axis] - 1)) - + label_min = paddle.min(valid_label) + label_max = paddle.max(valid_label) + if label_min < 0: + raise ValueError("label should not out of bound, but got{}". 
+ format(label_min)) + if label_max >= input.shape[axis]: + raise ValueError("label should not out of bound, but got{}". + format(label_max)) if core.is_compiled_with_npu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', @@ -1842,7 +1832,6 @@ def cross_entropy(input, valid_label = paddle.multiply( paddle.cast( label != ignore_index, dtype=label.dtype), label) - ignore_weight_mask = paddle.cast((label != ignore_index), input.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ From a8afed69971884f65546adc33f3d9a5cbd54a44e Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Mon, 10 Jan 2022 15:30:23 +0800 Subject: [PATCH 069/151] Profiler skeleton (#38826) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * profiler skeleton * update * update * update Co-authored-by: liutiexing --- .../new_executor/workqueue/CMakeLists.txt | 3 +- .../new_executor/workqueue/workqueue.cc | 4 +- paddle/fluid/platform/CMakeLists.txt | 3 +- paddle/fluid/platform/event.h | 34 ---------- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.h | 2 +- paddle/fluid/platform/profiler/CMakeLists.txt | 1 + .../platform/{ => profiler}/event_tracing.h | 0 .../{ => profiler}/host_event_recorder.cc | 4 +- .../{ => profiler}/host_event_recorder.h | 38 ++++++++++- .../platform/profiler/trace_event_collector.h | 65 +++++++++++++++++++ paddle/fluid/platform/profiler/tracer_base.h | 42 ++++++++++++ 12 files changed, 154 insertions(+), 44 deletions(-) create mode 100644 paddle/fluid/platform/profiler/CMakeLists.txt rename paddle/fluid/platform/{ => profiler}/event_tracing.h (100%) rename paddle/fluid/platform/{ => profiler}/host_event_recorder.cc (93%) rename paddle/fluid/platform/{ => profiler}/host_event_recorder.h (84%) create mode 100644 paddle/fluid/platform/profiler/trace_event_collector.h create mode 100644 paddle/fluid/platform/profiler/tracer_base.h diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt index 77130102d52e5..f47a274aaa4e5 100644 --- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt @@ -1,2 +1,3 @@ -cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc events_waiter.cc DEPS enforce glog) +cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog) +cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog) cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 3f06f3db23118..45694349168a4 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -198,7 +198,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return std::move(ptr); + return ptr; } std::unique_ptr CreateWorkQueueGroup( @@ -208,7 +208,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return std::move(ptr); + return 
ptr; } } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1031d1ed6357d..8a84429987d90 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -169,7 +169,8 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) -cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info) +add_subdirectory(profiler) + cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda) diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 919266575e6ce..da5080cc86f0c 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -201,39 +201,5 @@ class CudaEvent { #endif }; -struct CommonEvent { - public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} - - CommonEvent(std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, const std::string &attr_str) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - buf = static_cast(arena_allocator(attr_str.length() + 1)); - strncpy(buf, attr_str.c_str(), attr_str.length() + 1); - attr = buf; - } - - CommonEvent(const std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - } - - const char *name = nullptr; // not owned, designed for performance - uint64_t start_ns = 0; - uint64_t end_ns = 0; - EventRole role = EventRole::kOrdinary; - const char *attr = nullptr; // not owned, designed for performance -}; - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index eaa77273c8fd4..c4beac93ef134 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/host_event_recorder.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/host_event_recorder.h" #include "paddle/fluid/platform/profiler_helper.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 41cc3805f44da..122e19b7c2808 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -27,9 +27,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" -#include "paddle/fluid/platform/event_tracing.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt new file mode 100644 index 0000000000000..de22183df6034 --- /dev/null +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info) diff --git a/paddle/fluid/platform/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h similarity index 100% rename from paddle/fluid/platform/event_tracing.h rename to paddle/fluid/platform/profiler/event_tracing.h diff --git a/paddle/fluid/platform/host_event_recorder.cc b/paddle/fluid/platform/profiler/host_event_recorder.cc similarity index 93% rename from paddle/fluid/platform/host_event_recorder.cc rename to paddle/fluid/platform/profiler/host_event_recorder.cc index 750f39118d7d9..14054418c5d24 100644 --- a/paddle/fluid/platform/host_event_recorder.cc +++ b/paddle/fluid/platform/profiler/host_event_recorder.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/host_event_recorder.h" +#include "paddle/fluid/platform/profiler/host_event_recorder.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -26,7 +26,7 @@ HostEventSection HostEventRecorder::GatherEvents() { for (auto &kv : thread_recorders_) { host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); } - return std::move(host_sec); + return host_sec; } } // namespace platform diff --git a/paddle/fluid/platform/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h similarity index 84% rename from paddle/fluid/platform/host_event_recorder.h rename to paddle/fluid/platform/profiler/host_event_recorder.h index e8dd59ad4c6f1..071f0d65bd0a6 100644 --- a/paddle/fluid/platform/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -25,6 +25,40 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +struct CommonEvent { + public: + CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + CommonEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + CommonEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + template struct ContainsStdString : std::conditional_t< @@ -154,7 +188,7 @@ std::vector EventContainer::Reduce() { cur = next; } event_blocks_ = cur_event_block_ = new EventBlock; - return std::move(all_events); + return all_events; } template @@ -204,7 +238,7 @@ class ThreadEventRecorder { thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); - return std::move(thr_sec); + return thr_sec; } private: diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h new file mode 100644 index 0000000000000..eabafb73542dc --- /dev/null +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +struct HostRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint64_t process_id; + uint64_t thread_id; +}; + +struct RuntimeRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint64_t process_id; + uint64_t thread_id; + uint32_t correlation_id; +}; + +struct DeviceRecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint32_t correlation_id; +}; + +class TraceEventCollector { + public: + void AddHostRecord(HostRecord&& record) { host_records_.push_back(record); } + + void AddRuntimeRecord(RuntimeRecord&& record) { + runtime_records_.push_back(record); + } + + void AddDeviceRecord(DeviceRecord&& record) { + device_records_.push_back(record); + } + + private: + std::list host_records_; + std::list runtime_records_; + std::list device_records_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/tracer_base.h b/paddle/fluid/platform/profiler/tracer_base.h new file mode 100644 index 0000000000000..1d4e3447fe64e --- /dev/null +++ b/paddle/fluid/platform/profiler/tracer_base.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/profiler/trace_event_collector.h" + +namespace paddle { +namespace platform { + +class TracerBase { + public: + // The state machine for a Tracer. 
+ enum class TracerState { UNINITED, READY, STARTED, STOPED }; + + virtual void PrepareTracing() { state_ = TracerState::READY; } + + virtual void StartTracing() = 0; + + virtual void StopTracing() = 0; + + virtual void CollectTraceData(TraceEventCollector* collector) = 0; + + virtual ~TracerBase() {} + + protected: + TracerState state_ = TracerState::UNINITED; +}; + +} // namespace platform +} // namespace paddle From 5c35750436b59382115d154f01484184d1c92fac Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Mon, 10 Jan 2022 15:31:05 +0800 Subject: [PATCH 070/151] [bug fix] fix unfold runtime bug (#38819) --- paddle/fluid/operators/unfold_op.cc | 75 ++++++++++++++++------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index d4155960bebe5..3f580884aa515 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -143,40 +143,47 @@ class UnfoldOp : public framework::OperatorWithKernel { "but recieved dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); - std::vector out_dims; - out_dims.push_back(in_dims[0]); - - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - // check output height and width - PADDLE_ENFORCE_GT( - output_height, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - PADDLE_ENFORCE_GT( - output_width, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - - ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); + bool contain_unknown_dim = framework::contain_unknown_dim(in_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + std::vector out_dims; + out_dims.push_back(in_dims[0]); + + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = + CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], + paddings[2], strides[0]); + int output_width = + CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], paddings[1], + paddings[3], strides[1]); + // check output height and width + PADDLE_ENFORCE_GT( + output_height, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + 
output_width)); + PADDLE_ENFORCE_GT( + output_width, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + output_width)); + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + + ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); + } } protected: From 953638e00d3cb246a26dce584c05165126a9031e Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 10 Jan 2022 16:30:37 +0800 Subject: [PATCH 071/151] [Unify Tensors PR #6] Removed interfaces & members from lod_tensor,test=allcases (#38811) * Added shared_ptr member & corresponding interfaces to Storage * Removed original pten::Allocation from Storage and adjusted the interfaces accordingly * Fixed issues with storage offset * Used place to malloc allocation for TensorStorage * [Unify Tensors PR #3]Ported framework::Tensor interfaces to pten::DenseTensor * Fixed issues with place * Added comments * Moved mutable_data with stream argument to DenseTensor * Added set_offset interface * Fixed CI issues,test=allcases * [Unify Tensors PR #4] Port LoDTensor interfaces to DenseTensor * Removed friend class EigenTensor/EigenMatrix/EigenVector from Tensor * Modified framework::Tensor to inherit from DenseTensor * Reverted changes too pten_layout() interface * Removed friend classes * Rearranged cfunction calls from tensor.data() to tensor.data() * Fixed CI issues * Fixed lite issues * Fixed data() interface issues,test=allcases * Resolved IsInitialized() issues * Fixed ResetHolder() issues * Fixed MKLDNN & Storage issues * Resolved ShareBufferWith() issues * Fixed LoD issues * Removed interfaces & members from lod_tensor,test=allcases --- paddle/fluid/framework/lod_tensor.h | 52 +---------------------------- paddle/fluid/framework/tensor.cc | 3 ++ 2 files changed, 4 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index dff6d0e01839a..22f2027998137 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -108,54 +108,7 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); */ class LoDTensor : public Tensor { public: - LoDTensor() : Tensor() {} - - explicit LoDTensor(const LoD& lod) : lod_(lod) {} - - void set_lod(const LoD& lod) { lod_ = lod; } - - const LoD& lod() const { return lod_; } - - LoD* mutable_lod() { return &lod_; } - - /* - * Get the start offset and end offset of an element from LoD. - */ - std::pair lod_element(size_t level, size_t elem) const { - PADDLE_ENFORCE_LT( - level, NumLevels(), - platform::errors::InvalidArgument( - "The input level of LoD is invalid, it should be less than LoD " - "size. The input level is %zu, the LoD size is %zu.", - level, NumLevels())); - PADDLE_ENFORCE_LT(elem, NumElements(level), - platform::errors::InvalidArgument( - "The input element of LoD is invalid, it should be " - "less than the number of elements in its level." 
- "The input element is %zu, the number of elements in " - "its level is %zu.", - elem, NumElements(level))); - return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); - } - - /* - * Number of LoDTensor's levels, each level has units of data, for example, - * in the sentence's view, article, paragraph, sentence are 3 levels. - */ - size_t NumLevels() const { return lod_.size(); } - /* - * Number of elements in a level. - */ - size_t NumElements(size_t level = 0) const { - PADDLE_ENFORCE_LT( - level, NumLevels(), - platform::errors::InvalidArgument( - "The input level of LoD is invalid, it should be less than LoD " - "size. The input level is %zu, the LoD size is %zu.", - level, NumLevels())); - // the last offset is the end of last element - return (lod_)[level].size() - 1; - } + using Tensor::Tensor; // Split LoDTensor and copy to each place specified in places. std::vector SplitLoDTensor( @@ -163,9 +116,6 @@ class LoDTensor : public Tensor { void MergeLoDTensor(const std::vector& lod_tensors, platform::Place place); - - private: - LoD lod_; }; /* diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index e5dfe28be7a3c..f11b37825d4f0 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -110,7 +110,10 @@ std::vector Tensor::Chunk(int64_t chunks, int64_t axis) const { Tensor& Tensor::ShareDataWith(const Tensor& src) { src.check_memory_size(); + // Preserve LoD + auto lod = meta_.lod; *this = src; + meta_.lod = lod; return *this; } Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) { From 657b6742fc2436e920b80548bdf8a1fe20782241 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Mon, 10 Jan 2022 16:34:45 +0800 Subject: [PATCH 072/151] Add the backward support for QR (#38824) * Add the backward support for QR * Remove unnecessary comments --- paddle/fluid/operators/qr_op.h | 123 +++++++++++++++- paddle/fluid/operators/svd_helper.h | 135 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/test_qr_op.py | 91 +++++++++++- 4 files changed, 347 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 73ba52f590c0d..65dfb4261e96e 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { @@ -79,9 +80,11 @@ class QrCPUKernel : public framework::OpKernel { q_data = q.mutable_data>( context.GetPlace(), size_t(batch_size * m * k * sizeof(math::Real))); + memset(q_data, 0, size_t(batch_size * m * k * sizeof(math::Real))); } auto* r_data = r.mutable_data>( context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(math::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -126,8 +129,124 @@ template class QrGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - PADDLE_THROW(platform::errors::InvalidArgument( - "QR doesn't have the backward kernel now and will be supported soon.")); + const framework::Tensor& Q = *ctx.Input("Q"); + const framework::Tensor& R = *ctx.Input("R"); + // Use a different name A instead of X + const framework::Tensor& A = *ctx.Input("X"); + const 
framework::Tensor& dQ = + *ctx.Input(framework::GradVarName("Q")); + const framework::Tensor& dR = + *ctx.Input(framework::GradVarName("R")); + // Use a different name dA instead of dX + framework::Tensor& dA = + *ctx.Output(framework::GradVarName("X")); + dA.mutable_data>(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + math::SetConstant()(dev_ctx, &dA, T(0)); + + auto dito = math::DeviceIndependenceTensorOperations(ctx); + + std::string mode = ctx.Attr("mode"); + bool compute_q, reduced; + std::tie(compute_q, reduced) = _parse_qr_mode(mode); + if (!compute_q) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The derivative of qr is not implemented when mode='r'.")); + } + + auto a_dims = A.dims(); + int a_rank = a_dims.size(); + int m = a_dims[a_rank - 2]; + int n = a_dims[a_rank - 1]; + + if ((m > n) && (!reduced)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The derivative of qr is not implemented when mode='complete' and " + "nrows > ncols.")); + } + + // m >= n case + auto m_gt_n_case = []( + const framework::ExecutionContext& ctx, + math::DeviceIndependenceTensorOperations& dito, + const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q, + const Tensor& R) -> framework::Tensor { + // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable + // Programming Tensor Networks. + // https://arxiv.org/abs/1903.09650 Section 3. QR factorization + + // dR^H + framework::Tensor R_term; + if (ctx.HasInput(framework::GradVarName("R"))) { + R_term = dito.Matmul(R, dito.Transpose(dR)); + } else { + R_term = dito.Fill(framework::vectorize(R.dims()), 0); + } + + // dQ^H * Q + framework::Tensor Q_term; + if (ctx.HasInput(framework::GradVarName("Q"))) { + Q_term = dito.Matmul(dito.Transpose(dQ), Q); + } else { + Q_term = dito.Fill(framework::vectorize(R.dims()), 0); + } + + framework::Tensor M_tmp1 = dito.Sub(R_term, Q_term); + + // Compute M = (tril(M) + tril(M).mH()) * 0.5 Identity + framework::Tensor M_tril_0 = dito.TrilTriu(M_tmp1, 0, true); + framework::Tensor M_tril_1 = dito.TrilTriu(M_tmp1, -1, true); + framework::Tensor M = dito.Add(M_tril_0, dito.Transpose(M_tril_1)); + + framework::Tensor rhs_term; + if (ctx.HasInput(framework::GradVarName("Q"))) { + rhs_term = dito.Add(dQ, dito.Matmul(Q, M)); + } else { + rhs_term = dito.Matmul(Q, M); + } + + // dA * R^H = rhs_term + auto dA = + dito.TriangularSolve(dito.Transpose(dito.Conj(dito.Transpose(R))), + dito.Transpose(rhs_term), + /*upper=*/true, + /*transpose=*/false, + /*unitriangular=*/false); + + return dito.Transpose(dA); + }; + + if (m >= n) { + auto dA_tmp = m_gt_n_case(ctx, dito, dQ, dR, A, Q, R); + framework::TensorCopy(dA_tmp, dA.place(), &dA); + } else { + // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] + // Calculate dX and dY individually and concatenate them to get dA + dA.mutable_data>(ctx.GetPlace()); + + auto Y = dito.Slice(A, {-1}, {m}, {n}); + auto U = dito.Slice(R, {-1}, {0}, {m}); + framework::Tensor dY, dX, dV, dR_tmp, dQ_prime; + + if (ctx.HasInput(framework::GradVarName("R"))) { + dV = dito.Slice(dR, {-1}, {m}, {n}); + dR_tmp = dito.Slice(dR, {-1}, {0}, {m}); + // Y * dV^H + dQ_prime = dito.Matmul(Y, dito.Transpose(dV)); + } else { + dV = dito.Fill(framework::vectorize(Y.dims()), 0); + dQ_prime = dito.Fill(framework::vectorize(Q.dims()), 0); + } + + if (ctx.HasInput(framework::GradVarName("Q"))) { + dQ_prime = dito.Add(dQ_prime, dQ); + } + dX = m_gt_n_case(ctx, dito, dQ_prime, dR_tmp, A, Q, U); + dY = dito.Matmul(Q, dV); + // Concatenate dX 
and dY to get dA. + auto dA_tmp = dito.ConcatTwoTensors(dX, dY, -1); + framework::TensorCopy(dA_tmp, dA.place(), &dA); + } } }; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 6b2584682277e..8d17ddec6fbb4 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -146,6 +146,93 @@ static std::vector GetBroadcastShape(InTensors ins) { return broadcast_shape; } +static inline framework::DDim ComputeAndCheckShapeForConcatOp( + const bool is_runtime, const std::vector& inputs_dims, + const size_t axis) { + const size_t n = inputs_dims.size(); + auto out_dims = inputs_dims[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), out_dims.size(), + platform::errors::InvalidArgument( + "The shape of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, inputs_dims[0], i, inputs_dims[i])); + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + if (is_runtime) { + out_dims[axis] += inputs_dims[i][j]; + } else { + if (inputs_dims[i][j] == -1 || out_dims[j] == -1) { + out_dims[axis] = -1; + } else { + out_dims[axis] += inputs_dims[i][j]; + } + } + } else { + bool check_shape = + is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0); + if (check_shape) { + // check all shape in run time + PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], + platform::errors::InvalidArgument( + "The %d-th dimension of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + j, i, inputs_dims[0], i, inputs_dims[i])); + } + if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) { + out_dims[j] = inputs_dims[i][j]; + } + } + } + } + return out_dims; +} + +static inline int64_t ComputeAxisForConcatOp(int64_t axis, int64_t rank) { + PADDLE_ENFORCE_EQ( + axis >= -rank && axis < rank, true, + platform::errors::InvalidArgument( + "The axis is expected to be in range of [%d, %d), but got %d", -rank, + rank, axis)); + if (axis < 0) { + axis = axis + rank; + } + return axis > 0 ? axis : 0; +} + +// Prepared for the broadcast operation +static std::vector get_broadcast_batch_portion( + std::vector x, std::vector y) { + size_t size_x = x.size(); + size_t size_y = y.size(); + size_t size = std::max(size_x, size_y); + std::vector batchPortion(size); + + ptrdiff_t i = (ptrdiff_t)size - 1; + for (; i >= 0; --i) { + ptrdiff_t offset = size - i - 1; + ptrdiff_t dim_x = size_x - offset - 1; + ptrdiff_t dim_y = size_y - offset - 1; + int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; + int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; + + PADDLE_ENFORCE_EQ( + (x_size == y_size || x_size == 1 || y_size == 1), true, + platform::errors::PreconditionNotMet( + "The size of tensor x (%d) must match the size of tensor y " + "(%d) at non-singleton dimension %d.", + x_size, y_size, i)); + + batchPortion[i] = x_size != 1 ? 
x_size : y_size; + } + return batchPortion; +} + #define DITO_TRANSPOSE_RANK_CASE(N) \ case N: { \ math::Transpose trans; \ @@ -515,6 +602,54 @@ struct DeviceIndependenceTensorOperations { return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); } + framework::Tensor TriangularSolve(const framework::Tensor& x, + const framework::Tensor& y, bool upper, + bool transpose, bool unitriangular) { + framework::AttributeMap attrs; + attrs["upper"] = upper; + attrs["transpose"] = transpose; + attrs["unitriangular"] = unitriangular; + NameInTensorMap inputs({{"X", {&x}}, {"Y", {&y}}}); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto y_dims_n = y_dims.size(); + std::vector x_dims_vec = + paddle::framework::vectorize(x_dims); + std::vector y_dims_vec = + paddle::framework::vectorize(y_dims); + std::vector x_dims_vec_cut(x_dims_vec.begin(), + x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), + y_dims_vec.end() - 2); + std::vector expand_batch_portion = + get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); + std::vector y_broadcast_dims({expand_batch_portion}); + y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], + y_dims_vec[y_dims_n - 1]}); + std::vector out_shape(y_broadcast_dims.begin(), + y_broadcast_dims.end()); + return CreateOpRunAndReturnTensor("triangular_solve", inputs, attrs, + out_shape); + } + + framework::Tensor ConcatTwoTensors(const framework::Tensor& x, + const framework::Tensor& y, int axis) { + framework::AttributeMap attrs; + attrs["axis"] = axis; + std::vector inputs_dims({x.dims(), y.dims()}); + NameInTensorMap inputs({{"X", {&x, &y}}}); + size_t axis_ = + ComputeAxisForConcatOp(static_cast(axis), + static_cast(inputs_dims[0].size())); + framework::DDim out_dims = + ComputeAndCheckShapeForConcatOp(true, inputs_dims, axis_); + if (out_dims[axis_] < 0) { + out_dims[axis_] = -1; + } + std::vector out_shape = framework::vectorize(out_dims); + return CreateOpRunAndReturnTensor("concat", inputs, attrs, out_shape); + } + Tensor Conj(const Tensor& x) { Tensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9c3f9cbad50a0..64c247e56d1d3 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -975,6 +975,7 @@ set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_stack_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) +set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120) set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_qr_op.py b/python/paddle/fluid/tests/unittests/test_qr_op.py index ea2aaf3f00d5b..4be46837a67ae 100644 --- a/python/paddle/fluid/tests/unittests/test_qr_op.py +++ b/python/paddle/fluid/tests/unittests/test_qr_op.py @@ -21,6 +21,96 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core +from op_test import OpTest + + +class TestQrOp(OpTest): + def setUp(self): + paddle.enable_static() + np.random.seed(4) + self.op_type = "qr" + a, q, r = self.get_input_and_output() + 
self.inputs = {"X": a} + self.attrs = {"mode": self.get_mode()} + self.outputs = {"Q": q, "R": r} + + def get_dtype(self): + return "float64" + + def get_mode(self): + return "reduced" + + def get_shape(self): + return (11, 11) + + def get_input_and_output(self): + dtype = self.get_dtype() + shape = self.get_shape() + mode = self.get_mode() + assert mode != "r", "Cannot be backward in r mode." + a = np.random.rand(*shape).astype(dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + if mode == "reduced": + k = min_mn + else: + k = m + q_shape = list(a.shape[:-2]) + q_shape.extend([m, k]) + r_shape = list(a.shape[:-2]) + r_shape.extend([k, n]) + q = np.zeros(q_shape).astype(dtype) + r = np.zeros(r_shape).astype(dtype) + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + for i in range(batch_size): + coord = np.unravel_index(i, a.shape[:-2]) + tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode) + q[coord] = tmp_q + r[coord] = tmp_r + return a, q, r + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], ['Q', 'R']) + + +class TestQrOpCase1(TestQrOp): + def get_shape(self): + return (10, 12) + + +class TestQrOpCase2(TestQrOp): + def get_shape(self): + return (16, 15) + + +class TestQrOpCase3(TestQrOp): + def get_shape(self): + return (2, 12, 16) + + +class TestQrOpCase4(TestQrOp): + def get_shape(self): + return (3, 16, 15) + + +class TestQrOpCase5(TestQrOp): + def get_mode(self): + return "complete" + + def get_shape(self): + return (10, 12) + + +class TestQrOpCase6(TestQrOp): + def get_mode(self): + return "complete" + + def get_shape(self): + return (2, 10, 12) class TestQrAPI(unittest.TestCase): @@ -169,5 +259,4 @@ def run_qr_static(shape, mode, dtype): if __name__ == "__main__": - paddle.enable_static() unittest.main() From 3a23c1a224f4e51003ff106d8114a343ec6ecc23 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 10 Jan 2022 19:18:13 +0800 Subject: [PATCH 073/151] move get expected kernel args into pten (#38825) --- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/framework/operator.h | 40 +++++++++ paddle/fluid/framework/pten_utils.cc | 10 --- paddle/fluid/framework/pten_utils.h | 26 +----- paddle/fluid/framework/type_defs.h | 5 -- paddle/fluid/imperative/prepared_operator.cc | 2 +- paddle/fluid/operators/scale_op.cc | 16 +--- paddle/pten/CMakeLists.txt | 2 +- paddle/pten/core/CMakeLists.txt | 3 +- paddle/pten/core/arg_map_context.cc | 60 ++++++++++++++ paddle/pten/core/arg_map_context.h | 86 ++++++++++++++++++++ paddle/pten/core/kernel_def.h | 13 --- paddle/pten/kernels/CMakeLists.txt | 2 +- paddle/pten/ops/compat/scale_args_fn.h | 36 ++++++++ 14 files changed, 235 insertions(+), 68 deletions(-) create mode 100644 paddle/pten/core/arg_map_context.cc create mode 100644 paddle/pten/core/arg_map_context.h create mode 100644 paddle/pten/ops/compat/scale_args_fn.h diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a0c1bd44da01e..c3e54290fd3da 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1287,7 +1287,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( void OperatorWithKernel::ChoosePtenKernel(const ExecutionContext& ctx) const { pt_kernel_signature_.reset( new KernelSignature(std::move(this->GetExpectedPtenKernelArgs(ctx)))); - VLOG(6) << KernelSignatureToString(*pt_kernel_signature_.get()); + VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( new 
OpKernelType(std::move(InnerGetExpectedKernelType(ctx)))); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 59bc4813d985b..0a46c83a2b3ad 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -40,6 +40,7 @@ limitations under the License. */ #include "paddle/fluid/platform/variant.h" #include "paddle/utils/flat_hash_map.h" +#include "paddle/pten/core/arg_map_context.h" #include "paddle/pten/include/core.h" namespace paddle { @@ -438,6 +439,45 @@ class ExecutionContext { const RuntimeContext& ctx_; }; +// TODO(chenweihang): split impl based OpProto or Dygraph if needed +class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { + public: + explicit ExecutionArgumentMappingContext(const ExecutionContext& ctx) + : ctx_(ctx) {} + + bool HasInput(const std::string& name) const override { + return ctx_.HasInput(name); + } + + bool HasOutput(const std::string& name) const override { + return ctx_.HasOutput(name); + } + + bool HasAttr(const std::string& name) const override { + return ctx_.HasAttr(name); + } + + size_t InputSize(const std::string& name) const override { + return ctx_.InputSize(name); + } + + size_t OutputSize(const std::string& name) const override { + return ctx_.OutputSize(name); + } + + bool IsDenseTensorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType() || + ctx_.InputVar(name)->IsType(); + } + + bool IsSelectedRowsInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + + private: + const ExecutionContext& ctx_; +}; + template <> const Tensor* ExecutionContext::Input(const std::string& name) const; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b8aedcce3e3fa..9831c2628dc95 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -196,15 +196,5 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { GetOutputArgsNames()); } -std::string KernelSignatureToString(const KernelSignature& signature) { - std::stringstream os; - os << "Kernel Signature - name: " << signature.name - << "; inputs: " << string::join_strings(std::get<0>(signature.args), ", ") - << "; attributes: " - << string::join_strings(std::get<1>(signature.args), ", ") << "; outputs: " - << string::join_strings(std::get<2>(signature.args), ", "); - return os.str(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index fd893e04d3ca4..09d96045949a0 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -22,17 +22,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" +#include "paddle/pten/core/arg_map_context.h" +#include "paddle/pten/core/kernel_factory.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" namespace paddle { namespace framework { +using KernelSignature = pten::KernelSignature; + /* Kernel Key translate */ OpKernelType TransPtenKernelKeyToOpKernelType( @@ -42,24 +44,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( /* Kernel Args parse */ -struct KernelSignature { - std::string name; - KernelArgsTuple args; - - KernelSignature() = default; - KernelSignature(std::string&& kernel_name, - paddle::SmallVector&& inputs, - paddle::SmallVector&& attrs, - paddle::SmallVector&& outputs) - : name(std::move(kernel_name)), - args(std::make_tuple(inputs, attrs, outputs)) {} - KernelSignature(const std::string& kernel_name, - const paddle::SmallVector& inputs, - const paddle::SmallVector& attrs, - const paddle::SmallVector& outputs) - : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {} -}; - // TODO(chenweihang): we can generate this map by proto info in compile time class KernelSignatureMap { public: @@ -88,7 +72,5 @@ class KernelArgsNameMaker { virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; }; -std::string KernelSignatureToString(const KernelSignature& signature); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 7f7785b374ead..86bf2d8ac413e 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -84,10 +84,5 @@ using InferShapeFN = std::function; using InplacePair = std::unordered_map; using InferInplaceOpFN = std::function; -// tuple(input_names, attr_names, output_names) -using KernelArgsTuple = std::tuple, - paddle::SmallVector, - paddle::SmallVector>; - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 29cd24a1e7793..c355ace528d42 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -164,7 +164,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, if (FLAGS_run_pten_kernel && pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { auto pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); - VLOG(6) << framework::KernelSignatureToString(pt_kernel_signature); + VLOG(6) << pt_kernel_signature; auto pt_kernel_name = pt_kernel_signature.name; auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4e9c84ef4c950..86f4e1b3ac3ba 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" #include #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/ops/compat/scale_args_fn.h" namespace paddle { namespace framework { @@ -73,19 +74,8 @@ class ScaleOp : public framework::OperatorWithKernel { framework::KernelSignature GetExpectedPtenKernelArgs( const framework::ExecutionContext &ctx) const override { - if (ctx.InputVar("X")->IsType() || - ctx.InputVar("X")->IsType()) { - std::string scale_attr; - if (ctx.HasInput("ScaleTensor")) { - scale_attr = "ScaleTensor"; - } else { - scale_attr = "scale"; - } - return framework::KernelSignature( - "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); - } - // TODO(chenweihang): support other cases after selected rows added - return framework::KernelSignature("scale.unregistered", {}, {}, {}); + framework::ExecutionArgumentMappingContext arg_mapping_ctx(ctx); + return pten::ScaleOpArgumentMapping(arg_mapping_ctx); } }; diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 05b321c50c1c4..6a823ff3672bf 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(ops) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context infermeta) +set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta) get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) # keep this message for debug, remove it later if needless message(STATUS "All standard pten kernels: ${pten_kernels}") diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index 87c3612e35424..b1e35b240cd26 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -8,8 +8,9 @@ endif() cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils) cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce) +cc_library(arg_map_context SRCS arg_map_context.cc DEPS enforce) +cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce) cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector) cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base) diff --git a/paddle/pten/core/arg_map_context.cc b/paddle/pten/core/arg_map_context.cc new file mode 100644 index 0000000000000..d7aea11ddf043 --- /dev/null +++ b/paddle/pten/core/arg_map_context.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/core/arg_map_context.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/string_helper.h" + +namespace pten { + +OpArgumentMappingFnMap& OpArgumentMappingFnMap::Instance() { + static OpArgumentMappingFnMap g_op_arg_mapping_fn_map; + return g_op_arg_mapping_fn_map; +} + +bool OpArgumentMappingFnMap::Has(const std::string& op_type) const { + return fn_map_.find(op_type) != fn_map_.end(); +} + +const ArgumentMappingFn& OpArgumentMappingFnMap::Get( + const std::string& op_type) const { + auto it = fn_map_.find(op_type); + PADDLE_ENFORCE_NE( + it, + fn_map_.end(), + paddle::platform::errors::NotFound( + "Operator `%s`'s argument mapping funciton is not registered.", + op_type)); + return it->second; +} + +void OpArgumentMappingFnMap::Emplace(const std::string& op_type, + const std::string api_name, + ArgumentMappingFn fn) { + name_map_.emplace(op_type, api_name); + fn_map_.emplace(op_type, fn); +} + +std::ostream& operator<<(std::ostream& os, KernelSignature signature) { + os << "Kernel Signature - name: " << signature.name << "; inputs: " + << paddle::string::join_strings(std::get<0>(signature.args), ", ") + << "; attributes: " + << paddle::string::join_strings(std::get<1>(signature.args), ", ") + << "; outputs: " + << paddle::string::join_strings(std::get<2>(signature.args), ", "); + return os; +} + +} // namespace pten diff --git a/paddle/pten/core/arg_map_context.h b/paddle/pten/core/arg_map_context.h new file mode 100644 index 0000000000000..be9eb3af76a36 --- /dev/null +++ b/paddle/pten/core/arg_map_context.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/utils/flat_hash_map.h" +#include "paddle/utils/small_vector.h" + +namespace pten { + +// tuple(input_names, attr_names, output_names) +using KernelArgsTuple = std::tuple, + paddle::SmallVector, + paddle::SmallVector>; + +// TODO(chenweihang): Add more methods if needed in future +class ArgumentMappingContext { + public: + virtual ~ArgumentMappingContext() = default; + + virtual bool HasInput(const std::string& name) const = 0; + virtual bool HasOutput(const std::string& name) const = 0; + virtual bool HasAttr(const std::string& name) const = 0; + + virtual size_t InputSize(const std::string& name) const = 0; + virtual size_t OutputSize(const std::string& name) const = 0; + + virtual bool IsDenseTensorInput(const std::string& name) const = 0; + virtual bool IsSelectedRowsInput(const std::string& name) const = 0; +}; + +struct KernelSignature { + std::string name; + KernelArgsTuple args; + + KernelSignature() = default; + KernelSignature(std::string&& kernel_name, + paddle::SmallVector&& inputs, + paddle::SmallVector&& attrs, + paddle::SmallVector&& outputs) + : name(std::move(kernel_name)), + args(std::make_tuple(inputs, attrs, outputs)) {} + KernelSignature(const std::string& kernel_name, + const paddle::SmallVector& inputs, + const paddle::SmallVector& attrs, + const paddle::SmallVector& outputs) + : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {} +}; + +std::ostream& operator<<(std::ostream& os, KernelSignature signature); + +using ArgumentMappingFn = KernelSignature (*)(const ArgumentMappingContext&); + +class OpArgumentMappingFnMap { + public: + static OpArgumentMappingFnMap& Instance(); + + bool Has(const std::string& op_type) const; + + const ArgumentMappingFn& Get(const std::string& op_type) const; + + void Emplace(const std::string& op_type, + const std::string api_name, + ArgumentMappingFn fn); + + private: + paddle::flat_hash_map name_map_; + paddle::flat_hash_map fn_map_; +}; + +} // namespace pten diff --git a/paddle/pten/core/kernel_def.h b/paddle/pten/core/kernel_def.h index 48a579cd02b51..875083cfb59e3 100644 --- a/paddle/pten/core/kernel_def.h +++ b/paddle/pten/core/kernel_def.h @@ -26,17 +26,4 @@ using KernelArgsDefFn = void (*)(Kernel* kernel); using KernelArgsParseFn = void (*)(const KernelKey& default_key, KernelArgsDef* args_def); -// Multiple kernels of the same operation are distinguished by the difference -// of the overload name. 
For the convenience of reuse, we define some overload -// naming strings for the naming of the kernel - -// For kernels that contains dynamic tensor attribute and it need to be always -// on host device, such as `ScaleTensor` -constexpr char kContainHostTensorSuffix[] = "host"; - -// For kernels with SelectedRowsTensor input and output -constexpr char kContainSelectedRowsSuffix[] = "sr"; - -// For kernels with intermediate output -constexpr char kContainMidOutputTensorSuffix[] = "mid"; } // namespace pten diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index b76d408f89e85..fc04cd797f4a5 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -24,7 +24,7 @@ endif() # pten depends all pten kernel targets set_property(GLOBAL PROPERTY PTEN_KERNELS "") -set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils) +set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) diff --git a/paddle/pten/ops/compat/scale_args_fn.h b/paddle/pten/ops/compat/scale_args_fn.h new file mode 100644 index 0000000000000..b9a20400f971a --- /dev/null +++ b/paddle/pten/ops/compat/scale_args_fn.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/arg_map_context.h" + +namespace pten { + +KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + std::string scale_attr; + if (ctx.HasInput("ScaleTensor")) { + scale_attr = "ScaleTensor"; + } else { + scale_attr = "scale"; + } + return KernelSignature( + "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); + } + // TODO(chenweihang): support other cases after selected rows added + return KernelSignature("scale.unregistered", {}, {}, {}); +} + +} // namespace pten From 7d4ce5b30b06ea95c692f907bcd03d8d10da589f Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Mon, 10 Jan 2022 19:26:33 +0800 Subject: [PATCH 074/151] fix bug of fp16 (#38838) --- python/paddle/fluid/dygraph/layers.py | 42 +++++++++++++++++++-------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 4a60bdc4c72d3..4c37a378e0aae 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1279,8 +1279,36 @@ def register_state_dict_hook(self, hook): def _obtain_parameters_buffers(self, destination=None, include_sublayers=True, - structured_name_prefix="", - include_non_persistable_buffer=False): + structured_name_prefix=""): + """ + The difference from state_dict() is that state_dict_hook will not be called, + but the original types of parameters and buffers will be maintained. + """ + if destination is None: + destination = collections.OrderedDict() + for name, data in self._parameters.items(): + if data is not None: + destination[structured_name_prefix + name] = data + for name, buffer in self._buffers.items(): + if buffer is not None and name not in self._non_persistable_buffer_names_set: + destination[structured_name_prefix + name] = buffer + + if include_sublayers: + for layer_name, layer_item in self._sub_layers.items(): + if layer_item is not None: + destination_temp = destination.copy() + destination_temp.update( + layer_item._obtain_parameters_buffers( + destination_temp, include_sublayers, + structured_name_prefix + layer_name + ".")) + destination = destination_temp + return destination + + def _state_dict_impl(self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False): """ Get all parameters and persistable buffers of current layer and its sub-layers. 
And set them into a dict @@ -1313,16 +1341,6 @@ def _obtain_parameters_buffers(self, structured_name_prefix + layer_name + ".", include_non_persistable_buffer)) destination = destination_temp - return destination - - def _state_dict_impl(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - include_non_persistable_buffer=False): - destination = self._obtain_parameters_buffers( - destination, include_sublayers, structured_name_prefix, - include_non_persistable_buffer) for state_dict_hook in self._state_dict_hooks.values(): hook_result = state_dict_hook(destination) if hook_result is not None: From 31b1f707a653d9028858f16d0259d707c89041a7 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Mon, 10 Jan 2022 19:32:36 +0800 Subject: [PATCH 075/151] refactor the forward implementation of reshape npu op (#38748) * refactor the forward implementation of reshape npu op * update reshape npu op * update reshape npu op --- paddle/fluid/operators/reshape_op_npu.cc | 99 ++++++++++++++++++++---- 1 file changed, 85 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc index d6b1d79f2b1a8..8b6f9d4d48d94 100644 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ b/paddle/fluid/operators/reshape_op_npu.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { @@ -25,23 +26,93 @@ template class Reshape2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto stream = + ctx.template device_context() + .stream(); + auto place = ctx.GetPlace(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensor"); - if (list_new_shape_tensor.size() > 0) { - PADDLE_THROW(platform::errors::Unimplemented( - "Input(ShapeTensor) is not supported on NPU.")); + + std::vector target_shape_vector; + auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); + if (shape_tensor_vector.size() > 0) { + for (auto* shape_tensor : shape_tensor_vector) { + PADDLE_ENFORCE_EQ( + shape_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "If the element type of 'shape' in Reshape Op is Tensor, " + "the element's shape must be [1]. But received the element's " + "shape is [%d]", + shape_tensor->dims().size())); + + target_shape_vector.push_back(GetDataFromTensor(shape_tensor)[0]); + } + } else { + auto* shape_tensor = ctx.HasInput("Shape") + ? 
ctx.Input("Shape") + : nullptr; + if (shape_tensor) { + target_shape_vector = GetDataFromTensor(shape_tensor); + } else { + target_shape_vector = ctx.Attr>("shape"); + PADDLE_ENFORCE_GT( + target_shape_vector.size(), 0, + platform::errors::InvalidArgument( + "The length of shape attribute should be larger than 0 when " + "input ShapeTensor and Shape are empty!")); + } } - PADDLE_ENFORCE_EQ(ctx.Input("Shape"), nullptr, - platform::errors::Unimplemented( - "Input(Shape) is not supported on NPU.")); - auto shape = out->dims(); - out->mutable_data(ctx.GetPlace(), x->type()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), out); - out->Resize(shape); + + int num_negative = + std::count(target_shape_vector.begin(), target_shape_vector.end(), -1); + PADDLE_ENFORCE_LE( + num_negative, 1, + platform::errors::InvalidArgument( + "The max number of -1 in shape attribute or shape tensor is 1 " + "but received %d.", + num_negative)); + auto it_zero = + std::find(target_shape_vector.begin(), target_shape_vector.end(), 0); + if (it_zero != target_shape_vector.end()) { + int x_rank = x->dims().size(); + for (size_t i = 0; i < target_shape_vector.size(); i++) { + if (target_shape_vector[i] == 0) { + PADDLE_ENFORCE_LT( + i, x_rank, + platform::errors::InvalidArgument( + "The index of 0 in shape attribute or shape tensor", + "should be less than input dim size, ", + "but the index is %d and input dim size is %d", i, x_rank)); + target_shape_vector[i] = x->dims().at(i); + } + } + } + + auto it = + std::find(target_shape_vector.begin(), target_shape_vector.end(), -1); + if (it != target_shape_vector.end()) { + auto ddim_out_vec = framework::vectorize(x->dims()); + int ddim_out_product = std::accumulate( + ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies()); + int reshape_out_product = std::accumulate(target_shape_vector.begin(), + target_shape_vector.end(), -1, + std::multiplies()); + int index = std::distance(target_shape_vector.begin(), it); + target_shape_vector[index] = ddim_out_product / reshape_out_product; + } + + auto out_dims = framework::make_ddim(target_shape_vector); + out->mutable_data(out_dims, place); + + NpuOpRunner runner; + // the shape input must be on the host side + runner.SetType("Reshape") + .AddInput(*x) + .AddInput(std::vector(target_shape_vector)) + .AddOutput(*out) + .AddAttr("axis", 0) + .AddAttr("num_axes", -1); + runner.Run(stream); } }; From ededcda24bfb9592b76c251a828abf33cacc0245 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 10 Jan 2022 19:34:55 +0800 Subject: [PATCH 076/151] [fleet_executor] framework for big model inference (#38795) --- .../distributed/fleet_executor/CMakeLists.txt | 2 +- .../distributed/fleet_executor/dist_model.cc | 154 ++++++++++++++++++ .../distributed/fleet_executor/dist_model.h | 69 ++++++++ paddle/fluid/pybind/bind_fleet_executor.cc | 19 +++ 4 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/distributed/fleet_executor/dist_model.cc create mode 100644 paddle/fluid/distributed/fleet_executor/dist_model.h diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index d8372e10888d9..1e31187367bd3 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -12,7 +12,7 @@ endif() cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) -cc_library(fleet_executor SRCS fleet_executor.cc 
carrier.cc task_node.cc runtime_graph.cc +cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc compute_interceptor.cc amplifier_interceptor.cc message_service.cc message_bus.cc DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc new file mode 100644 index 0000000000000..7e820a38581af --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/distributed/fleet_executor/dist_model.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace distributed { + +namespace { +bool IsPersistable(const framework::VarDesc *var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { + return true; + } + return false; +} +} // namespace + +bool DistModel::Init() { + /* TODO(fleet exe dev): implement this funct */ + place_ = paddle::platform::CUDAPlace(config_.device_id); + if (!PrepareScope()) { + return false; + } + if (!PrepareProgram()) { + return false; + } + if (!CommInit()) { + return false; + } + return true; +} + +bool DistModel::CommInit() { + // TODO(fleet executor): init the comm + return true; +} + +bool DistModel::PrepareScope() { + scope_.reset(new framework::Scope()); + return true; +} + +bool DistModel::PrepareProgram() { + if (!LoadProgram()) { + return false; + } + if (!LoadParameters()) { + return false; + } + return true; +} + +bool DistModel::LoadProgram() { + VLOG(3) << "Loading program from " << config_.model_dir; + PADDLE_ENFORCE_NE(config_.model_dir, "", platform::errors::InvalidArgument( + "Model dir must be provided.")); + std::string model_path = config_.model_dir + ".pdmodel"; + framework::proto::ProgramDesc program_proto; + std::string pb_content; + // Read binary + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + model_path)); + fin.seekg(0, std::ios::end); + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + program_proto.ParseFromString(pb_content); + VLOG(5) << pb_content; + program_.reset(new framework::ProgramDesc(program_proto)); + return true; +} + +bool DistModel::LoadParameters() { + VLOG(3) << "Loading parameters 
from " << config_.model_dir; + PADDLE_ENFORCE_NOT_NULL(program_.get(), + platform::errors::PreconditionNotMet( + "The program should be loaded first.")); + const auto &global_block = program_->MutableBlock(0); + + // create a temporary program to load parameters. + + std::unique_ptr load_program( + new framework::ProgramDesc()); + framework::BlockDesc *load_block = load_program->MutableBlock(0); + std::vector params; + + for (auto *var : global_block->AllVars()) { + if (IsPersistable(var)) { + VLOG(3) << "persistable variable's name: " << var->Name(); + framework::VarDesc *new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + params.push_back(new_var->Name()); + } + } + + std::string param_path = config_.model_dir + ".pdiparams"; + // sort paramlist to have consistent ordering + std::sort(params.begin(), params.end()); + // append just the load_combine op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", params); + op->SetAttr("file_path", {param_path}); + op->CheckAttrs(); + + framework::NaiveExecutor e(place_); + // Create all persistable variables in root scope to load them from ckpt. + // Other non-persistable variables will be created in the micro scope + // managed by fleet executor. + e.CreateVariables(*program_, 0, true, scope_.get()); + e.Prepare(scope_.get(), *load_program, 0, false); + e.Run(); + VLOG(3) << "After loading there are " << scope_->LocalVarNames().size() + << " vars."; + + return true; +} + +void DistModel::Run(const std::vector &input_data, + std::vector *output_data) { + /* TODO(fleet exe dev): implement this funct */ +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h new file mode 100644 index 0000000000000..57bfd88147746 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class Tensor; +} + +namespace distributed { + +struct DistModelConfig { + std::string model_dir{}; + std::vector trainer_endpoints{}; + std::string current_endpoint{}; + int64_t nranks{1}; + int64_t local_rank{0}; + int64_t device_id{0}; + int64_t mp_degree{1}; + int64_t pp_degree{1}; +}; + +class DistModel { + public: + explicit DistModel(const DistModelConfig& config) : config_(config) {} + bool Init(); + void Run(const std::vector& input_data, + std::vector* output_data); + ~DistModel() = default; + + private: + DISABLE_COPY_AND_ASSIGN(DistModel); + + bool PrepareScope(); + bool PrepareProgram(); + bool LoadProgram(); + bool LoadParameters(); + bool CommInit(); + + DistModelConfig config_; + FleetExecutorDesc executor_desc_; + platform::Place place_; + std::shared_ptr scope_; + std::shared_ptr program_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b2ace4c0b5745..1240ad94deff2 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/pybind/bind_fleet_executor.h" #include +#include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/operator.h" @@ -28,6 +29,8 @@ namespace pybind { using paddle::distributed::FleetExecutor; using paddle::distributed::TaskNode; +using paddle::distributed::DistModelConfig; +using paddle::distributed::DistModel; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; @@ -51,6 +54,22 @@ void BindFleetExecutor(py::module* m) { .def("role", &TaskNode::role) .def("init", &TaskNode::Init) .def("set_program", &TaskNode::SetProgram); + + py::class_(*m, "DistModelConfig") + .def(py::init<>()) + .def_readwrite("model_dir", &DistModelConfig::model_dir) + .def_readwrite("trainer_endpoints", &DistModelConfig::trainer_endpoints) + .def_readwrite("current_endpoint", &DistModelConfig::current_endpoint) + .def_readwrite("nranks", &DistModelConfig::nranks) + .def_readwrite("local_rank", &DistModelConfig::local_rank) + .def_readwrite("device_id", &DistModelConfig::device_id) + .def_readwrite("mp_degree", &DistModelConfig::mp_degree) + .def_readwrite("pp_degree", &DistModelConfig::pp_degree); + + py::class_(*m, "DistModel") + .def(py::init()) + .def("init", &DistModel::Init) + .def("run", &DistModel::Run, py::call_guard()); } } // namespace pybind } // namespace paddle From c50c22b0ac7df30e4a2308ba3951f204df83efbf Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 10 Jan 2022 19:46:42 +0800 Subject: [PATCH 077/151] [Fleet Executor] Modified python cache strategy to support multi carriers (#38839) --- .../distributed/fleet_executor/carrier.cc | 54 ++++++++++++-- .../distributed/fleet_executor/carrier.h | 8 ++- .../fleet_executor/fleet_executor.cc | 55 +++------------ .../fleet_executor/fleet_executor.h | 11 ++- .../fleet_executor/fleet_executor_desc.proto | 1 - python/paddle/fluid/executor.py | 70 +++++++++++-------- 6 files changed, 103 insertions(+), 96 deletions(-) 
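For illustration, the DistModelConfig/DistModel bindings added just above in bind_fleet_executor.cc (#38795) can be driven from Python roughly as sketched below. The paddle.fluid.core import path and all field values are assumptions for illustration only; DistModel::Run and CommInit are still stubs at this point in the series, so only construction and init() are exercised.

    # Hypothetical driver for the DistModel bindings above (not part of the patch).
    import paddle.fluid.core as core  # assumed import path for the pybind module

    config = core.DistModelConfig()
    config.model_dir = "/path/to/model_prefix"  # prefix: loads "<prefix>.pdmodel" / "<prefix>.pdiparams"
    config.trainer_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
    config.current_endpoint = "127.0.0.1:6170"
    config.nranks = 2
    config.local_rank = 0
    config.device_id = 0
    config.mp_degree = 2
    config.pp_degree = 1

    model = core.DistModel(config)
    assert model.init()  # prepares scope/program and loads parameters; comm init is still a TODO here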
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 79ca6f467a38d..3e198dc3eeea4 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -19,7 +19,9 @@ #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" namespace paddle { namespace distributed { @@ -43,18 +45,24 @@ void Carrier::Init( int64_t rank, const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place) { + const framework::ProgramDesc& program, framework::Scope* scope, + int64_t num_micro_batches, const platform::Place& place) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; - minibatch_scope_ = minibatch_scope; - microbatch_scopes_ = microbatch_scopes; place_ = place; - root_scope_ = root_scope; + root_scope_ = scope; dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument( + "root_scope can not be nullptr")); + minibatch_scope_ = &root_scope_->NewScope(); + microbatch_scopes_.resize(num_micro_batches); + for (int i = 0; i < num_micro_batches; ++i) { + microbatch_scopes_[i] = &minibatch_scope_->NewScope(); + CopyParameters(i, program); + } + // TODO(fleet_exe dev): thread pool thread_num_ = 1; thread_pool_.SetThreadNum(thread_num_); @@ -64,10 +72,33 @@ void Carrier::Init( is_init_ = true; } -void Carrier::Release() {} +void Carrier::Release() { + if (root_scope_) { + root_scope_->DropKids(); + } +} Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } +void Carrier::CopyParameters(int microbatch_id, + const framework::ProgramDesc& program) { + auto& global_block = program.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Persistable() && microbatch_id == 0) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(5) << "Create persistable var: " << var->Name() + << ", which pointer is " << ptr; + } else if (!var->Persistable()) { + auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); + VLOG(5) << "Create variable " << var->Name() << " for microbatch " + << microbatch_id << ", which pointer is " << ptr << "."; + InitializeVariable(ptr, var->GetType()); + } + } +} + bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { PADDLE_ENFORCE_EQ( @@ -116,6 +147,15 @@ void Carrier::Start() { // TODO(wangxi): async step Wait(); dev_ctx_->Wait(); + for (auto* micro_scope : microbatch_scopes_) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. 
+ micro_scope->DropKids(); + } } bool Carrier::IsInit() const { return is_init_; } diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 75ac07083a796..7762effdb9c87 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -34,6 +34,7 @@ namespace paddle { namespace framework { class Scope; +class ProgramDesc; } namespace distributed { @@ -55,9 +56,10 @@ class Carrier final { int64_t rank, const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place); + const framework::ProgramDesc& program, framework::Scope* scope, + int64_t num_micro_batches, const platform::Place& place); + + void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index d6c1e678ad4f7..19c44fa521b1b 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -22,8 +22,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable_helper.h" namespace paddle { namespace distributed { @@ -38,7 +36,6 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { } FleetExecutor::~FleetExecutor() { - root_scope_->DropKids(); for (const auto& carrier_id : carrier_ids_) { GlobalMap::Get(carrier_id)->Release(); } @@ -47,7 +44,7 @@ FleetExecutor::~FleetExecutor() { void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, - const std::vector& task_nodes, + int64_t num_micro_batches, const std::vector& task_nodes, const std::unordered_map& task_id_to_rank) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( @@ -72,31 +69,23 @@ void FleetExecutor::Init( for (auto& unique_op : ops) { unique_op.release(); } - root_scope_ = scope; - place_ = place; - PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument( - "root_scope_ can not be nullptr")); - minibatch_scope_ = &root_scope_->NewScope(); - int64_t num_micro_batches = exe_desc_.num_micro_batches(); - microbatch_scopes_.resize(num_micro_batches); - for (int i = 0; i < num_micro_batches; ++i) { - microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program_desc); - } VLOG(5) << runtime_graph_->DebugString(); Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier) { +void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, + const platform::Place& place, + int64_t num_micro_batches, + const framework::ProgramDesc& program_desc) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), - runtime_graph_->interceptor_id_to_node(), root_scope_, - minibatch_scope_, 
microbatch_scopes_, place_); + runtime_graph_->interceptor_id_to_node(), program_desc, scope, + num_micro_batches, place); } void FleetExecutor::InitMessageBus() { @@ -140,34 +129,6 @@ void FleetExecutor::Run(const std::string& carrier_id) { GlobalVal::Get()->Barrier(); } carrier->Start(); - for (auto* micro_scop : microbatch_scopes_) { - // By default, we should delete all kid scopes after run executor because - // some operators may create local scope when running, such as while_op. - // But when while_op also create a local executor to run it's sub block, - // the sub scopes it created should not be dropped immediately, because - // while_grad_op will use some variables created during while_op run, so - // we need to keep the kids and wait for the outer executor to drop them. - micro_scop->DropKids(); - } -} - -void FleetExecutor::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { - auto& global_block = program.Block(0); - - for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { - auto* ptr = root_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(5) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (!var->Persistable()) { - auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(5) << "Create variable " << var->Name() << " for microbatch " - << microbatch_id << ", which pointer is " << ptr << "."; - InitializeVariable(ptr, var->GetType()); - } - } } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index 89ab4c62d386f..b2af3e4e457c7 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -39,7 +39,7 @@ class FleetExecutor final { ~FleetExecutor(); void Init(const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, - const platform::Place& place, + const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, const std::unordered_map& task_id_to_rank); void Run(const std::string& carrier_id); @@ -47,14 +47,11 @@ class FleetExecutor final { private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void InitCarrier(Carrier* carrier, framework::Scope* scope, + const platform::Place& place, int64_t num_micro_batches, + const framework::ProgramDesc& program_desc); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; - framework::Scope* root_scope_; - framework::Scope* minibatch_scope_; - platform::Place place_; - std::vector microbatch_scopes_; std::unordered_set carrier_ids_; }; diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto index aa553557852a7..d048660774b39 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto @@ -23,5 +23,4 @@ message RankInfo { message FleetExecutorDesc { optional int64 cur_rank = 1 [ default = 0 ]; // Rank id of current processor repeated RankInfo cluster_info = 2; - optional int64 num_micro_batches = 3 [ default = 1 ]; } diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 7f282b8cea07a..d67d4944c69cb 100644 
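The _prepare_fleet_executor() helper introduced in the executor.py hunk below derives the rank topology from environment variables rather than from arguments. A minimal sketch of the environment it assumes, with placeholder endpoints and rank:

    # Placeholder environment read by _prepare_fleet_executor() below (illustrative only).
    import os
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6170,127.0.0.1:6171"  # all ranks
    os.environ["PADDLE_TRAINER_ID"] = "0"  # rank of this process
    # With this, fleet_exe_desc.cur_rank becomes 0 and one RankInfo (rank, ip_port)
    # is appended per endpoint before core.FleetExecutor is built from the desc.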
--- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -400,6 +400,23 @@ def _is_enable_standalone_executor(): return flag +def _prepare_fleet_executor(): + from ..distributed.fleet.proto import fleet_executor_desc_pb2 + trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") + trainer_endpoints = trainer_endpoints_str.split(',') + fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() + cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + fleet_exe_desc.cur_rank = cur_rank + nrank = len(trainer_endpoints) + for rank, endpoint in enumerate(trainer_endpoints): + rank_info = fleet_executor_desc_pb2.RankInfo() + rank_info.rank = rank + rank_info.ip_port = endpoint + fleet_exe_desc.cluster_info.append(rank_info) + fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString()) + return fleet_exe + + def _get_strong_program_cache_key(program, feed, fetch_list): # NOTE(xiongkun) id(proram) may be duplicate. So add addition var_name as cache key. def _get_varname_from_block(block): @@ -692,6 +709,8 @@ def __init__(self, place=None): self._enable_interpreter_core = _is_enable_standalone_executor() self._executor_cache = _ExecutorCache(self.place) + self._fleet_executor = None + def _get_scope_cache(self, program_cache_key): return self.scope_caches.get(program_cache_key, None) @@ -1281,6 +1300,9 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, if isinstance(program, Program) and program._pipeline_opt: if "fleet_opt" in program._pipeline_opt: + # Move prepare here for port conflict with nccl in startup program + if self._fleet_executor is None: + self._fleet_executor = _prepare_fleet_executor() return self._run_using_fleet_executor( program=program, feed=feed, fetch_list=fetch_list) if "startup_program" in program._pipeline_opt: @@ -1960,27 +1982,16 @@ def _get_real_program_fetch_list(): return ctx - def _prepare_fleet_executor(self, - carrier_id="", - program=None, - scope=None, - fleet_opt=None): - from ..distributed.fleet.proto import fleet_executor_desc_pb2 - assert program, "Program for fleet executor should not be None" - assert fleet_opt, "Configurations for fleet executor should not be None" - trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") - trainer_endpoints = trainer_endpoints_str.split(',') - fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() + def _prepare_fleet_executor_carrier(self, + carrier_id="", + program=None, + scope=None, + fleet_opt=None): + num_micro_batches = fleet_opt[ + "num_micro_batches"] if "num_micro_batches" in fleet_opt else 1 cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) - fleet_exe_desc.cur_rank = cur_rank + trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(',') nrank = len(trainer_endpoints) - for rank, endpoint in enumerate(trainer_endpoints): - rank_info = fleet_executor_desc_pb2.RankInfo() - rank_info.rank = rank - rank_info.ip_port = endpoint - fleet_exe_desc.cluster_info.append(rank_info) - if "num_micro_batches" in fleet_opt: - fleet_exe_desc.num_micro_batches = fleet_opt["num_micro_batches"] assert 'scheduler' in fleet_opt or 'tasks' in fleet_opt, \ "Fleet executor need configuration for scheduler, you can choose from 1F1B or Origin. 
" \ @@ -2019,12 +2030,10 @@ def _prepare_fleet_executor(self, # NOTE: have to hold these vars, otherwise will be destructed fleet_opt['tasks'] = tasks fleet_opt['task_id_to_rank'] = task_id_to_rank - fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString()) place = core.Place() place.set_place(self.place) - fleet_exe.init(carrier_id, program.desc, scope, place, tasks, - task_id_to_rank) - return fleet_exe + self._fleet_executor.init(carrier_id, program.desc, scope, place, + num_micro_batches, tasks, task_id_to_rank) def _run_using_fleet_executor(self, program=None, @@ -2032,16 +2041,15 @@ def _run_using_fleet_executor(self, feed_var_name="feed", fetch_var_name="fetch", fetch_list=None): - # TODO(liyurui): Change cache strategy for multi carriers cache_key = _get_strong_program_cache_key(program, feed, fetch_list) - cached_ctx = self._get_ctx_cache(cache_key) - cached_scope = self._get_scope_cache(cache_key) cached_program = self._get_program_cache(cache_key) - real_feed = [] if feed is None else feed + cached_scope = self._get_scope_cache(cache_key) if cached_scope is None: cached_scope = global_scope() self._add_scope_cache(cache_key, cached_scope) if cached_program is None: + assert program._pipeline_opt, "program should have _pipeline_opt to start carrier" + real_feed = [] if feed is None else feed real_program = program if "section_program" in program._pipeline_opt: real_program = program._pipeline_opt["section_program"] @@ -2060,7 +2068,6 @@ def _run_using_fleet_executor(self, 'op_role', core.op_proto_and_checker_maker.OpRole.Optimize) self._add_program_cache(cache_key, cached_program) - if cached_ctx is None: fleet_opt = program._pipeline_opt["fleet_opt"] if 'tasks' in fleet_opt: # Insert feed/fetch op for cloned program in each task node, @@ -2097,12 +2104,12 @@ def _run_using_fleet_executor(self, core.op_proto_and_checker_maker.OpRole.Optimize) fetch_task.set_program(fetch_program) - cached_ctx = self._prepare_fleet_executor( + self._prepare_fleet_executor_carrier( cache_key, program=cached_program, scope=cached_scope, fleet_opt=fleet_opt) - self._add_ctx_cache(cache_key, cached_ctx) + if feed: # NOTE: don't have to traverse programs in task nodes, # since they all sub program of cached program and @@ -2120,7 +2127,8 @@ def _run_using_fleet_executor(self, lr_sheduler._var_name) tensor.set(data, self.place) - cached_ctx.run(cache_key) + self._fleet_executor.run(cache_key) + if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() tensors = arr._move_to_list() From 405103d8ada42aacc42aa326b9381e851f240ffa Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 10 Jan 2022 19:55:49 +0800 Subject: [PATCH 078/151] Add gpu kernel for new api : linalg.lstsq (#38621) * add lstsq gpu kernel * update * add docs_en * modify ut * fix bugs * modify example in docs_en * remove lstsq_op.cu from ROCM cmake * modify docs_en * modify docs_en * modify docs_en * remove unneccessary TensorCopy --- cmake/operators.cmake | 1 + paddle/fluid/operators/lstsq_op.cu | 211 ++++++++++++++++++ paddle/fluid/operators/lstsq_op.h | 47 +++- paddle/fluid/operators/qr_op.cu | 52 ++--- paddle/fluid/operators/qr_op.h | 8 + paddle/fluid/platform/dynload/cusolver.h | 4 + .../tests/unittests/test_linalg_lstsq_op.py | 195 +++++++++------- python/paddle/tensor/linalg.py | 75 ++++++- 8 files changed, 465 insertions(+), 128 deletions(-) create mode 100644 paddle/fluid/operators/lstsq_op.cu diff --git a/cmake/operators.cmake b/cmake/operators.cmake 
index 2d1ce4e834217..2d4aa1a815fff 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -203,6 +203,7 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu") list(REMOVE_ITEM hip_srcs "qr_op.cu") list(REMOVE_ITEM hip_srcs "eigh_op.cu") + list(REMOVE_ITEM hip_srcs "lstsq_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu new file mode 100644 index 0000000000000..a71b900f14f8e --- /dev/null +++ b/paddle/fluid/operators/lstsq_op.cu @@ -0,0 +1,211 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include +#include +#include "paddle/fluid/operators/lstsq_op.h" +#include "paddle/fluid/operators/qr_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class LstsqCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + auto* solution = context.Output("Solution"); + + auto dito = + math::DeviceIndependenceTensorOperations(context); + auto& dev_ctx = + context.template device_context(); + + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int dim_size = x_dims.size(); + int m = x_dims[dim_size - 2]; + int n = x_dims[dim_size - 1]; + int nrhs = y_dims[dim_size - 1]; + int min_mn = std::min(m, n); + int max_mn = std::max(m, n); + int k = min_mn; + + int x_stride = MatrixStride(x); + int y_stride = MatrixStride(y); + int tau_stride = min_mn; + int batch_count = BatchCount(x); + + Tensor new_x, new_y; + new_x.mutable_data(context.GetPlace(), + size_t(batch_count * m * n * sizeof(T))); + new_y.mutable_data(context.GetPlace(), + size_t(batch_count * m * nrhs * sizeof(T))); + framework::TensorCopy(x, context.GetPlace(), &new_x); + framework::TensorCopy(y, context.GetPlace(), &new_y); + + // Prepare tau + auto tau_dims_vec = framework::vectorize(x_dims); + tau_dims_vec.pop_back(); + tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; + Tensor tau = dito.Fill(tau_dims_vec, 0); + auto tau_data = tau.mutable_data(context.GetPlace()); + + if (m >= n) { + Tensor tmp_x = dito.Transpose(new_x); + Tensor tmp_y = dito.Transpose(new_y); + auto x_data = tmp_x.mutable_data(context.GetPlace()); + auto y_data = tmp_y.mutable_data(context.GetPlace()); + + // step 1, compute QR factorization using geqrf + BatchedGeqrf(dev_ctx, batch_count, m, n, x_data, m, + tau_data, x_stride, tau_stride); + + // Step 2, Y <- Q^H Y + BatchedOrmqr(dev_ctx, true, true, batch_count, m, n, k, + x_data, x_stride, tau_data, tau_stride, + y_data, y_stride); 
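+      // With A = Q * R from Step 1 and Y overwritten by Q^H * Y in Step 2, the
+      // least-squares solution of the (m >= n) branch is X = R^{-1} * (Q^H * Y)[:n].
+      // The lines below recover the upper-triangular R and the first min(m, n)
+      // rows of Q^H * Y before the triangular solve in Step 3.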
+ + Tensor trans_r = dito.Transpose(tmp_x); + Tensor slice_r = dito.Slice(trans_r, {-2}, {0}, {min_mn}); + Tensor res_r = dito.TrilTriu(slice_r, 0, false); + + Tensor trans_y = dito.Transpose(tmp_y); + Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); + + // Step 3, solve R X = Y + triangular_solve(dev_ctx, res_r, slice_y, solution, + true, false, false); + } else { + auto x_data = new_x.mutable_data(context.GetPlace()); + auto y_data = new_y.mutable_data(context.GetPlace()); + + // step 1, compute QR factorization using geqrf + BatchedGeqrf(dev_ctx, batch_count, n, m, x_data, n, + tau_data, x_stride, tau_stride); + + // Step 2, solve R^H Z = Y + Tensor trans_r = dito.Transpose(new_x); + triangular_solve(dev_ctx, trans_r, new_y, solution, + true, true, false); + + // Step 3, X <- Q Z + BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, + n, tau_data, x_stride, tau_stride); + Tensor trans_q = dito.Transpose(new_x); + Tensor slice_q = dito.Slice(trans_q, {-1}, {0}, {m}); + Tensor solu_tensor = dito.Matmul(slice_q, *solution, false, false); + framework::TensorCopy(solu_tensor, solution->place(), solution); + } + } +}; + +template <> +void BatchedOrmqr( + const platform::CUDADeviceContext& dev_ctx, bool left, bool transpose, + int batch_size, int m, int n, int k, float* a, int a_stride, float* tau, + int tau_stride, float* other, int other_stride) { + int lwork = 0; + auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; + auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N; + int lda = std::max(1, left ? m : n); + int ldc = std::max(1, m); + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSormqr_bufferSize( + handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + float* other_working_ptr = &other[i * other_stride]; + // compute ormgr + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSormqr( + handle, side, trans, m, n, k, a_working_ptr, lda, tau_working_ptr, + other_working_ptr, ldc, workspace_ptr, lwork, info_d)); + + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver info is not zero but [%d]", i, info_h)); + } +} + +template <> +void BatchedOrmqr( + const platform::CUDADeviceContext& dev_ctx, bool left, bool transpose, + int batch_size, int m, int n, int k, double* a, int a_stride, double* tau, + int tau_stride, double* other, int other_stride) { + int lwork = 0; + auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; + auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N; + int lda = std::max(1, left ? 
m : n); + int ldc = std::max(1, m); + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDormqr_bufferSize( + handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + double* other_working_ptr = &other[i * other_stride]; + // compute ormgr + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDormqr( + handle, side, trans, m, n, k, a_working_ptr, lda, tau_working_ptr, + other_working_ptr, ldc, workspace_ptr, lwork, info_d)); + + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver info is not zero but [%d]", i, info_h)); + } +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lstsq, ops::LstsqCUDAKernel, + ops::LstsqCUDAKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index b9c5c87a6a376..be411232706a5 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -49,7 +49,7 @@ class LstsqCPUKernel : public framework::OpKernel { using ValueType = math::Real; const Tensor& x = *context.Input("X"); - const Tensor& y = *context.Input("Y"); + auto y = context.Input("Y"); auto rcond = context.Attr("rcond"); auto driver_string = context.Attr("driver"); @@ -68,13 +68,15 @@ class LstsqCPUKernel : public framework::OpKernel { math::DeviceIndependenceTensorOperations(context); auto x_dims = x.dims(); - auto y_dims = y.dims(); + auto y_dims = y->dims(); int dim_size = x_dims.size(); int x_stride = MatrixStride(x); - int y_stride = MatrixStride(y); + int y_stride = MatrixStride(*y); int batch_count = BatchCount(x); - auto ori_solution_dim = solution->dims(); + auto solution_dim = solution->dims(); int ori_solu_stride = MatrixStride(*solution); + int max_solu_stride = std::max(y_stride, ori_solu_stride); + int min_solu_stride = std::min(y_stride, ori_solu_stride); // lapack is a column-major storge, transpose make the input to // have a continuous memory layout @@ -88,13 +90,24 @@ class LstsqCPUKernel : public framework::OpKernel { Tensor new_x; new_x.mutable_data(context.GetPlace(), size_t(batch_count * m * n * sizeof(T))); + framework::TensorCopy(x, context.GetPlace(), &new_x); + solution->mutable_data( context.GetPlace(), size_t(batch_count * std::max(m, n) * nrhs * sizeof(T))); - framework::TensorCopy(x, context.GetPlace(), &new_x); - framework::TensorCopy(y, context.GetPlace(), solution); - if (m < n) solution->Resize(UDDim(ori_solution_dim)); + if (m >= n) { + const Tensor& new_y = *context.Input("Y"); + framework::TensorCopy(new_y, context.GetPlace(), solution); + } else { + auto* solu_data = solution->data(); + auto* y_data = y->data(); + for (auto i = 0; i < batch_count; i++) { + for (auto j = 0; j < min_solu_stride; j++) { + solu_data[i * max_solu_stride + j] = y_data[i * y_stride + j]; + } + } + } Tensor input_x_trans = dito.Transpose(new_x); Tensor input_y_trans = 
dito.Transpose(*solution); @@ -186,10 +199,9 @@ class LstsqCPUKernel : public framework::OpKernel { iwork_data = iwork.mutable_data(context.GetPlace()); } - int solu_stride = std::max(y_stride, ori_solu_stride); for (auto i = 0; i < batch_count; ++i) { auto* x_input = &x_vector[i * x_stride]; - auto* y_input = &y_vector[i * solu_stride]; + auto* y_input = &y_vector[i * max_solu_stride]; rank_working_ptr = rank_working_ptr ? &rank_data[i] : nullptr; s_working_ptr = s_working_ptr ? &s_data[i * s_stride] : nullptr; @@ -221,9 +233,24 @@ class LstsqCPUKernel : public framework::OpKernel { Tensor tmp_s = dito.Transpose(*solution); framework::TensorCopy(tmp_s, solution->place(), solution); - if (m >= n) solution->Resize(UDDim(ori_solution_dim)); + if (m > n) { + auto* solu_data = solution->data(); + for (auto i = 1; i < batch_count; i++) { + for (auto j = 0; j < min_solu_stride; j++) { + solu_data[i * min_solu_stride + j] = + solu_data[i * max_solu_stride + j]; + } + } + } + + solution->Resize(UDDim(solution_dim)); } }; +template +void BatchedOrmqr(const DeviceContext& dev_ctx, bool left, bool transpose, + int batch_size, int m, int n, int k, T* a, int a_stride, + T* tau, int tau_stride, T* other, int other_stride); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 3eb5f72b5b117..af5ebdc53126a 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -88,8 +88,8 @@ class QrGPUKernel : public framework::OpKernel { auto qr_data = qr.mutable_data(context.GetPlace()); auto tau_data = tau.mutable_data(context.GetPlace()); - BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, - tau_stride); + BatchedGeqrf( + dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); if (reduced_mode) { auto trans_qr = dito.Transpose(qr); @@ -108,8 +108,9 @@ class QrGPUKernel : public framework::OpKernel { // Perform QRGQR for Q using the result from GEQRF // Transpose 'q' to retore the original row-major order if (reduced_mode) { - BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m, - tau_data, qr_stride, tau_stride); + BatchedOrgqr( + dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m, tau_data, + qr_stride, tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -128,13 +129,15 @@ class QrGPUKernel : public framework::OpKernel { (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), dev_ctx.stream()); } - BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, - tau_data, new_qr_stride, tau_stride); + BatchedOrgqr( + dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, tau_data, + new_qr_stride, tau_stride); auto trans_q = dito.Transpose(new_qr); framework::TensorCopy(trans_q, q.place(), &q); } else { - BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data, - qr_stride, tau_stride); + BatchedOrgqr( + dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data, + qr_stride, tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -142,28 +145,12 @@ class QrGPUKernel : public framework::OpKernel { } } } - - void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, - int m, int n, float* a, int lda, float* tau, int a_stride, - int tau_stride) const; - - void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, 
- int m, int n, double* a, int lda, double* tau, int a_stride, - int tau_stride) const; - - void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, - int m, int n, int k, float* a, int lda, float* tau, - int a_stride, int tau_stride) const; - - void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, - int m, int n, int k, double* a, int lda, double* tau, - int a_stride, int tau_stride) const; }; template <> -void QrGPUKernel::BatchedGeqrf( +void BatchedGeqrf( const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, - float* a, int lda, float* tau, int a_stride, int tau_stride) const { + float* a, int lda, float* tau, int a_stride, int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -195,9 +182,9 @@ void QrGPUKernel::BatchedGeqrf( } template <> -void QrGPUKernel::BatchedGeqrf( +void BatchedGeqrf( const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, - double* a, int lda, double* tau, int a_stride, int tau_stride) const { + double* a, int lda, double* tau, int a_stride, int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -229,9 +216,9 @@ void QrGPUKernel::BatchedGeqrf( } template <> -void QrGPUKernel::BatchedOrgqr( +void BatchedOrgqr( const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, - int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int k, float* a, int lda, float* tau, int a_stride, int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -263,10 +250,9 @@ void QrGPUKernel::BatchedOrgqr( } template <> -void QrGPUKernel::BatchedOrgqr( +void BatchedOrgqr( const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, - int k, double* a, int lda, double* tau, int a_stride, - int tau_stride) const { + int k, double* a, int lda, double* tau, int a_stride, int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 65dfb4261e96e..1731aa9e07206 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -250,5 +250,13 @@ class QrGradKernel : public framework::OpKernel { } }; +template +void BatchedGeqrf(const DeviceContext& dev_ctx, int batch_size, int m, int n, + T* a, int lda, T* tau, int a_stride, int tau_stride); + +template +void BatchedOrgqr(const DeviceContext& dev_ctx, int batch_size, int m, int n, + int k, T* a, int lda, T* tau, int a_stride, int tau_stride); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index f9dc6baea3c29..63661a93cfd85 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -81,6 +81,8 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnZgeqrf_bufferSize); \ __macro(cusolverDnSorgqr_bufferSize); \ __macro(cusolverDnDorgqr_bufferSize); \ + __macro(cusolverDnSormqr_bufferSize); \ + __macro(cusolverDnDormqr_bufferSize); \ __macro(cusolverDnCungqr_bufferSize); \ __macro(cusolverDnZungqr_bufferSize); \ __macro(cusolverDnDestroyGesvdjInfo); \ @@ -98,6 +100,8 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnZgeqrf); \ __macro(cusolverDnSorgqr); \ __macro(cusolverDnDorgqr); \ + __macro(cusolverDnSormqr); \ + __macro(cusolverDnDormqr); \ __macro(cusolverDnCungqr); \ __macro(cusolverDnZungqr); diff --git 
a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 4c0325a35f32e..59ac2e28087c8 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -18,11 +18,15 @@ import numpy as np import paddle import paddle.fluid as fluid +import paddle.fluid.core as core class LinalgLstsqTestCase(unittest.TestCase): def setUp(self): + self.devices = ["cpu"] self.init_config() + if core.is_compiled_with_cuda() and self.driver == "gels": + self.devices.append("gpu:0") self.generate_input() self.generate_output() @@ -43,104 +47,129 @@ def generate_output(self): if len(self._input_shape_1) == 2: out = np.linalg.lstsq( self._input_data_1, self._input_data_2, rcond=self.rcond) + self._output_solution = out[0] + self._output_residuals = out[1] + self._output_rank = out[2] + self._output_sg_values = out[3] elif len(self._input_shape_1) == 3: - out = np.linalg.lstsq( - self._input_data_1[0], self._input_data_2[0], rcond=self.rcond) - - self._output_solution = out[0] - self._output_residuals = out[1] - self._output_rank = out[2] - self._output_sg_values = out[3] + self._output_solution = [] + self._output_residuals = [] + self._output_rank = [] + self._output_sg_values = [] + for i in range(self._input_shape_1[0]): + out = np.linalg.lstsq( + self._input_data_1[i], + self._input_data_2[i], + rcond=self.rcond) + self._output_solution.append(out[0]) + self._output_residuals.append(out[1]) + self._output_rank.append(out[2]) + self._output_sg_values.append(out[3]) def test_dygraph(self): paddle.disable_static() - paddle.device.set_device("cpu") - place = paddle.CPUPlace() - x = paddle.to_tensor(self._input_data_1, place=place, dtype=self.dtype) - y = paddle.to_tensor(self._input_data_2, place=place, dtype=self.dtype) - results = paddle.linalg.lstsq( - x, y, rcond=self.rcond, driver=self.driver) - - res_solution = results[0].numpy() - res_residuals = results[1].numpy() - res_rank = results[2].numpy() - res_singular_values = results[3].numpy() - - if x.shape[-2] > x.shape[-1] and self._output_rank == x.shape[-1]: - if (np.abs(res_residuals - self._output_residuals) < 1e-6).any(): - pass - else: - raise RuntimeError("Check LSTSQ residuals dygraph Failed") - - if self.driver in ("gelsy", "gelsd", "gelss"): - if (np.abs(res_rank - self._output_rank) < 1e-6).any(): - pass - else: - raise RuntimeError("Check LSTSQ rank dygraph Failed") - - if self.driver in ("gelsd", "gelss"): - if (np.abs(res_singular_values - self._output_sg_values) < 1e-6 - ).any(): - pass - else: - raise RuntimeError("Check LSTSQ singular values dygraph Failed") - - def test_static(self): - paddle.enable_static() - paddle.device.set_device("cpu") - place = fluid.CPUPlace() - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = paddle.fluid.data( - name="x", - shape=self._input_shape_1, - dtype=self._input_data_1.dtype) - y = paddle.fluid.data( - name="y", - shape=self._input_shape_2, - dtype=self._input_data_2.dtype) + for dev in self.devices: + paddle.set_device(dev) + place = paddle.CPUPlace() if dev == "cpu" else paddle.CUDAPlace(0) + x = paddle.to_tensor( + self._input_data_1, place=place, dtype=self.dtype) + y = paddle.to_tensor( + self._input_data_2, place=place, dtype=self.dtype) results = paddle.linalg.lstsq( x, y, rcond=self.rcond, driver=self.driver) - exe = fluid.Executor(place) - fetches = exe.run( - fluid.default_main_program(), - feed={"x": self._input_data_1, - "y": 
self._input_data_2}, - fetch_list=[results]) - - if x.shape[-2] > x.shape[-1] and self._output_rank == x.shape[-1]: - if (np.abs(fetches[1] - self._output_residuals) < 1e-6).any(): - pass - else: - raise RuntimeError("Check LSTSQ residuals static Failed") + self._result_solution = results[0].numpy() + self._result_residuals = results[1].numpy() + self._result_rank = results[2].numpy() + self._result_sg_values = results[3].numpy() + self.assert_np_close() + def test_static(self): + paddle.enable_static() + for dev in self.devices: + paddle.set_device(dev) + place = fluid.CPUPlace() if dev == "cpu" else fluid.CUDAPlace(0) + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.fluid.data( + name="x", + shape=self._input_shape_1, + dtype=self._input_data_1.dtype) + y = paddle.fluid.data( + name="y", + shape=self._input_shape_2, + dtype=self._input_data_2.dtype) + results = paddle.linalg.lstsq( + x, y, rcond=self.rcond, driver=self.driver) + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"x": self._input_data_1, + "y": self._input_data_2}, + fetch_list=[results]) + self._result_solution = fetches[0] + self._result_residuals = fetches[1] + self._result_rank = fetches[2] + self._result_sg_values = fetches[3] + self.assert_np_close() + + def assert_np_close(self): + if len(self._input_shape_1) == 2: + np.testing.assert_allclose( + self._result_solution, self._output_solution, rtol=1e-3) + if self._input_shape_1[-2] > self._input_shape_1[ + -1] and self._output_rank == self._input_shape_1[-1]: + np.testing.assert_allclose( + self._result_residuals, self._output_residuals, rtol=1e-5) if self.driver in ("gelsy", "gelsd", "gelss"): - if (np.abs(fetches[2] - self._output_rank) < 1e-6).any(): - pass - else: - raise RuntimeError("Check LSTSQ rank static Failed") - + np.testing.assert_allclose( + self._result_rank, self._output_rank, rtol=1e-5) if self.driver in ("gelsd", "gelss"): - if (np.abs(fetches[3] - self._output_sg_values) < 1e-6).any(): - pass - else: - raise RuntimeError( - "Check LSTSQ singular values static Failed") + np.testing.assert_allclose( + self._result_sg_values, self._output_sg_values, rtol=1e-5) + else: + for i in range(len(self._output_solution)): + np.testing.assert_allclose( + self._result_solution[i], + self._output_solution[i], + rtol=1e-3) + if self._input_shape_1[-2] > self._input_shape_1[ + -1] and self._output_rank[i] == self._input_shape_1[-1]: + np.testing.assert_allclose( + self._result_residuals[i], + self._output_residuals[i], + rtol=1e-5) + if self.driver in ("gelsy", "gelsd", "gelss"): + np.testing.assert_allclose( + self._result_rank[i], self._output_rank[i], rtol=1e-5) + if self.driver in ("gelsd", "gelss"): + np.testing.assert_allclose( + self._result_sg_values[i], + self._output_sg_values[i], + rtol=1e-5) + + +class LinalgLstsqTestCase1(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.driver = "gels" + self._input_shape_1 = (9, 9) + self._input_shape_2 = (9, 5) -class LinalgLstsqTestCase(LinalgLstsqTestCase): +class LinalgLstsqTestCase2(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float64' self.rcond = 1e-15 self.driver = "gels" self._input_shape_1 = (5, 10) - self._input_shape_2 = (5, 5) + self._input_shape_2 = (5, 8) class LinalgLstsqTestCaseRcond(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float64' - self.rcond = 0.1 - self.driver = "gels" + self.rcond = 1e-7 + self.driver = "gelsd" self._input_shape_1 = (3, 2) 
self._input_shape_2 = (3, 3) @@ -148,7 +177,7 @@ def init_config(self): class LinalgLstsqTestCaseGelsFloat32(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float32' - self.rcond = 1e-15 + self.rcond = None self.driver = "gels" self._input_shape_1 = (10, 5) self._input_shape_2 = (10, 2) @@ -157,7 +186,7 @@ def init_config(self): class LinalgLstsqTestCaseGelssFloat64(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float64' - self.rcond = 1e-15 + self.rcond = None self.driver = "gelss" self._input_shape_1 = (5, 5) self._input_shape_2 = (5, 1) @@ -176,7 +205,7 @@ class LinalgLstsqTestCaseBatch1(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float32' self.rcond = 1e-15 - self.driver = None + self.driver = "gelss" self._input_shape_1 = (2, 3, 10) self._input_shape_2 = (2, 3, 4) @@ -186,8 +215,8 @@ def init_config(self): self.dtype = 'float64' self.rcond = 1e-15 self.driver = "gelss" - self._input_shape_1 = (2, 8, 6) - self._input_shape_2 = (2, 8, 2) + self._input_shape_1 = (10, 8, 6) + self._input_shape_2 = (10, 8, 2) class LinalgLstsqTestCaseLarge1(LinalgLstsqTestCase): @@ -201,7 +230,7 @@ def init_config(self): class LinalgLstsqTestCaseLarge2(LinalgLstsqTestCase): def init_config(self): - self.dtype = 'float32' + self.dtype = 'float64' self.rcond = 1e-15 self.driver = "gelss" self._input_shape_1 = (50, 600) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 5f71606b7dc40..170889588aadb 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2816,8 +2816,66 @@ def __check_input(x, UPLO): return out_value -def lstsq(x, y, rcond=1e-15, driver=None, name=None): - device = paddle.device.get_device() +def lstsq(x, y, rcond=None, driver=None, name=None): + """ + Computes a solution to + the least squares problem of a system of linear equations. + + Args: + x (Tensor): A tensor with shape ``(*, M, N)`` , the data type of the input Tensor ``x`` + should be one of float32, float64. + y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` + should be one of float32, float64. + rcond(float, optional): The default value is None. A float pointing number used to determine + the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the + machine precision of x_dtype. + driver(str, optional): The default value is None. The name of LAPACK method to be used. For + CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only + valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ + for CUDA inputs. + name(str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). + ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` + is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed + when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor + with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in + (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. 
``singular_values`` is a tensor with + shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when + ``driver`` in (‘gelsd’, ‘gelss’), otherwise return an empty tensor. + + Examples: + .. code-block:: python + + import paddle + + paddle.set_device("cpu") + x = paddle.to_tensor([[1, 3], [3, 2], [5, 6.]]) + y = paddle.to_tensor([[3, 4, 6], [5, 3, 4], [1, 2, 1.]]) + results = paddle.linalg.lstsq(x, y, driver="gelsd") + print(results[0]) + # [[ 0.78350395, -0.22165027, -0.62371236], + # [-0.11340097, 0.78866047, 1.14948535]] + print(results[1]) + # [19.81443405, 10.43814468, 30.56185532]) + print(results[2]) + # 2 + print(results[3]) + # [9.03455734, 1.54167950] + + x = paddle.to_tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12.]]) + y = paddle.to_tensor([[4, 2, 9], [2, 0, 3], [2, 5, 3.]]) + results = paddle.linalg.lstsq(x, y, driver="gels") + print(results[0]) + # [[ 0.39386186, 0.10230173, 0.93606132], + # [ 0.10741687, -0.29028133, 0.11892585], + # [-0.05115091, 0.51918161, -0.19948854]] + print(results[1]) + # [] + """ + device = paddle.get_device() if device == "cpu": if driver not in (None, "gels", "gelss", "gelsd", "gelsy"): raise ValueError( @@ -2833,6 +2891,19 @@ def lstsq(x, y, rcond=1e-15, driver=None, name=None): else: raise RuntimeError("Only support lstsq api for CPU or CUDA device.") + if x.dtype == y.dtype and x.dtype in (paddle.float32, paddle.float64): + pass + else: + raise ValueError( + "Only support x and y have the same dtype such as 'float32' and 'float64'." + ) + + if rcond is None: + if x.dtype == paddle.float32: + rcond = 1e-7 * max(x.shape[-2], x.shape[-1]) + elif x.dtype == paddle.float64: + rcond = 1e-15 * max(x.shape[-2], x.shape[-1]) + if in_dygraph_mode(): solution, rank, singular_values = _C_ops.lstsq(x, y, "rcond", rcond, "driver", driver) From 1f8fe035b90b5742d4471a7132dbbf8f73269cfa Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Mon, 10 Jan 2022 20:23:16 +0800 Subject: [PATCH 079/151] update mul_gru_fuse_pass ut timeout setting (#38763) --- .../tests/unittests/ir/inference/test_mul_gru_fuse_pass.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py index 9b1400e45bbc0..b5a5377043571 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py @@ -132,7 +132,8 @@ def sample_predictor_configs(self, program_config): yield config, ["im2sequence", "fusion_gru"], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, passes=["mul_gru_fuse_pass"]) + self.run_and_statis( + quant=False, max_duration=300, passes=["mul_gru_fuse_pass"]) if __name__ == "__main__": From 0a7cb901b432395154eb3595f7928c9142f268ac Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Mon, 10 Jan 2022 23:15:09 +0800 Subject: [PATCH 080/151] add retry on pull dense sync (#38793) --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 56 ++++++++++++++++++- paddle/fluid/framework/fleet/heter_context.h | 1 - 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 225c2656fbfd1..f90027556342d 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -632,6 +632,7 @@ void FleetWrapper::PullSparseToTensorSync(const 
uint64_t table_id, int fea_dim, if (ret != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << ret << "]"; sleep(sleep_seconds_before_fail_exit_); + exit(-1); } #else for (size_t index = 0; index < inputs->size(); ++index) { @@ -685,9 +686,36 @@ void FleetWrapper::PullDenseVarsSync( paddle::ps::Region reg(w, tensor->numel()); regions.emplace_back(std::move(reg)); } - auto status = - pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid); - status.wait(); + int32_t status = -1; + int32_t cnt = 0; + while (true) { + auto tt = pslib_ptr_->_worker_ptr->pull_dense(regions.data(), + regions.size(), tid); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull dense sync failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull dense sync failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } #endif } @@ -1248,6 +1276,7 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id, if (ret.get() != 0) { LOG(ERROR) << "load model of table id: " << table_id << ", from path: " << path << " failed"; + exit(-1); } #else VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib"; @@ -1263,6 +1292,7 @@ void FleetWrapper::LoadWithWhitelist(const uint64_t table_id, if (ret.get() != 0) { LOG(ERROR) << "load model of table id: " << table_id << ", from path: " << path << " failed"; + exit(-1); } #else VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib"; @@ -1311,6 +1341,7 @@ void FleetWrapper::SaveModelOneTable(const uint64_t table_id, if (ret.get() != 0) { LOG(ERROR) << "save model of table id: " << table_id << ", to path: " << path << " failed"; + exit(-1); } #else VLOG(0) << "FleetWrapper::SaveModelOneTable does nothing when no pslib"; @@ -1328,6 +1359,7 @@ void FleetWrapper::SaveModelOneTablePrefix(const uint64_t table_id, if (ret.get() != 0) { LOG(ERROR) << "save model (with prefix) of table id: " << table_id << ", to path: " << path << " failed"; + exit(-1); } #else VLOG(0) << "FleetWrapper::SaveModelOneTablePrefix does nothing when no pslib"; @@ -1351,6 +1383,7 @@ void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "setdate : " << date << " failed"; + exit(-1); } #else VLOG(0) << "FleetWrapper::SetDate does nothing when no pslib-gpu"; @@ -1463,6 +1496,11 @@ void FleetWrapper::ShrinkSparseTable(int table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "Shrink Sparse Table failed"; + exit(-1); + } #else VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib"; #endif @@ -1472,6 +1510,10 @@ void FleetWrapper::ClearModel() { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->clear(); ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "Clear Model failed"; + } #else VLOG(0) << "FleetWrapper::ClearModel does nothing when no pslib"; #endif @@ -1481,6 +1523,10 @@ void FleetWrapper::ClearOneTable(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->clear(table_id); ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "Clear One Table failed table_id: " << 
table_id; + } #else VLOG(0) << "FleetWrapper::ClearOneTable does nothing when no pslib"; #endif @@ -1541,6 +1587,10 @@ void FleetWrapper::ClientFlush() { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->flush(); ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "Client Flush failed"; + } #else VLOG(0) << "FleetWrapper::ServerFlush does nothing when no pslib"; #endif diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 45f9b04383944..3e8b0cfbc31f3 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -235,7 +235,6 @@ class HeterContext { } VLOG(3) << "heter_context unique keys with dynamic mf dimention"; } - for (std::thread& t : threads) { t.join(); } From ffbc2122afb24f5ec0a173283c78e11ad8cd9966 Mon Sep 17 00:00:00 2001 From: fengkuangxiaxia Date: Tue, 11 Jan 2022 10:43:01 +0800 Subject: [PATCH 081/151] roi_align fix (#38788) --- paddle/fluid/inference/tensorrt/op_teller.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 8504474168d53..878eef016e7d1 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/op_teller.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -1283,7 +1285,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } std::vector attrs{"pooled_height", "pooled_width", - "spatial_scale", "sampling_ratio"}; + "spatial_scale", "sampling_ratio", + "aligned"}; for (auto const attr : attrs) { if (!desc.HasAttr(attr)) return false; } From d368647112e3194298f521196a2ff3df453ec6be Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 11 Jan 2022 12:30:07 +0800 Subject: [PATCH 082/151] [Eager] fix some eager logic (#38576) * Rearranged Eager AutoCodeGen directory structure * Removed USE_OP in Eager AutoCodeGen * Enabled generation for Operators without Grad/Inputs/Outputs * Resolved operators without input * Fixed merge conflicts * Enabled Eager AutoCodeGen for 10+ more operators * Refactored Eager AutoCodeGen with more organized helper objects * Enabled Eager AutoCodeGen for operators with multiple OpBases * Adjusted Eager AutoCodeGen to Enable Passing Output Tensor as Input Argument * Handled Dispensable Inputs/Outputs in Eager AutoCodeGen * Adjusted function generation/call between Python-C API & Dygraph API * Synchronized auto-generated Python-C API with Dygraph Forward Functions * support more eager tensor api * fix merge compile error * fix compile error and fit develop code * support pure CPU * fix some logic error in eager_mode * support _varbase_creator in eager mode * Added safe_initialized interface to EagerTensor for use in processing dispensable inputs * for eager mode * refine * support multiple constructor for eager tensor * add place related code * polish code * specific randint with dtype of int64 * Support pure cpu test * eager logic * refine test in pure cpu * eager logic * eager logic * eager logic, test=develop * skip core.eager when in inference, test=develop * refine, test=develop * refine, test=develop * call RetainGrad after run forward kernel, test=develop * refine, test=develop * support dygraph util, meta, guard test * eager test 
case * support inference test * refine test and fix initializer failed * modify eagertensor patch method * add eagertensor.clear_grandint, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * support create varbase and fix retain grad error * call monkey_patch_varbase in _test_eager_guard, test=develop * fix windows error * split clear_gradient to clear_gradient and zero_grads, test=develop * refine, test=develop * refine, test=develop * support test_imperative_basic test in eager mode * remove additional log in variable.h * remove additional log in variable.h * remove additional code create in merge * eager * fix some eager logic, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop Co-authored-by: jim19930609 Co-authored-by: JiabinYang <360788950@qq.com> --- .../eager/accumulation/accumulation_node.cc | 3 + .../eager/accumulation/accumulation_node.h | 2 +- paddle/fluid/eager/eager_tensor.h | 50 ++++++++-------- .../data_structure_tests/eager_tensor_test.cc | 2 +- .../performance_tests/benchmark_utils.cc | 8 +-- .../eager/tests/task_tests/generated_test.cc | 10 ++-- paddle/fluid/pybind/eager_method.cc | 58 +++++++++++++++---- paddle/fluid/pybind/eager_properties.cc | 28 +++++++-- .../fluid/dygraph/varbase_patch_methods.py | 32 +++++++--- 9 files changed, 134 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 69628d9b40021..ed1146eed0fb0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -28,6 +28,9 @@ static void CopyOrAddTensor(egr::EagerTensor* tensor, const egr::EagerTensor& t) { + if (t.Var().IsInitialized()) { + const_cast(&t)->SyncToTensor(); + } if (!tensor->defined() || !tensor->initialized()) { // Simply copy tensor->impl *tensor = t; diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index a2683db75e92c..9578924b783f5 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -32,7 +32,7 @@ class GradNodeAccumulation : public GradNodeBase { void RetainGrad( const std::function& hook); - egr::EagerTensor Grad() { return accumulated_grad; } + egr::EagerTensor* Grad() { return &accumulated_grad; } private: egr::EagerTensor accumulated_grad; diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 0bcef2253f993..72fe5732e9620 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -239,8 +239,8 @@ class EagerTensor final { auto tensor_dense = std::dynamic_pointer_cast(tensor_->impl()); if (tensor_dense) { - paddle::experimental::MovesStorage(tensor_dense.get(), - framework_tensor); + paddle::experimental::SharesStorage(tensor_dense.get(), + framework_tensor); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerTensor type, only " @@ -258,27 +258,23 @@ class EagerTensor final { /** Part 11: Sync paddle::framework::Variable with pten::Tensor **/ void SyncToTensor() { // Synchronize allocation only once. - if (!this->defined() || !this->initialized()) { - // TODO(jiabin): Support selected rows later. 
- if (var_.IsInitialized()) { - if (var_.IsType()) { - SetImplWithLegacyTensor(); - } else if (var_.IsType()) { - SetImplWithLegacyTensor(); - } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Unable to fetch underlying tensor " - "from VarBase, only LoDTensor and " - "Tensor are supported for now")); - } + if (var_.IsInitialized()) { + if (var_.IsType()) { + SetImplWithLegacyTensor(); + } else if (var_.IsType()) { + SetImplWithLegacyTensor(); } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Can not Sync EagerTensor %s whose paddle::framework::Variable is " - "not initialized!", - name())); + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to fetch underlying tensor " + "from VarBase, only LoDTensor and " + "Tensor are supported for now")); } + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Can not Sync EagerTensor %s whose paddle::framework::Variable is " + "not initialized!", + name())); } } @@ -296,8 +292,16 @@ class EagerTensor final { template void SetImplWithLegacyTensor() { const auto& framework_tensor = var_.Get(); - this->set_impl( - std::move(paddle::experimental::MakePtenDenseTensor(framework_tensor))); + if (this->initialized()) { + VLOG(8) << "Sync Var to initialized tensor for: " << name(); + paddle::experimental::ReMakePtenDenseTensor( + framework_tensor, + static_cast(this->impl().get())); + } else { + VLOG(8) << "Sync Var to uninitialized tensor for: " << name(); + this->set_impl(std::move( + paddle::experimental::MakePtenDenseTensor(framework_tensor))); + } var_.Clear(); } diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index a02f0bec456bf..84daf4eac4ce6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -118,7 +118,7 @@ TEST(EagerTensor, MemberFunction) { CHECK_EQ(et3.Var().Get().data()[1], 10.0f); VLOG(6) << "SyncToTensor"; - CHECK(et3.initialized() == false); + CHECK(et3.initialized() == true); et3.SyncToTensor(); CHECK(et3.initialized() == true); VLOG(6) << "Check Tensor"; diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index baa99dc93c2dd..e05a63a69d002 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -87,8 +87,8 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X, // Examine Forward Grad (w.r.t max_num_runs = 2) eager_test::CompareVariableWithValue(input_tensor0, 16); // Examine Backward Grad (w.r.t max_num_runs = 2) - eager_test::CompareGradVariableWithValue(X, 16); - eager_test::CompareGradVariableWithValue(Y, 16); + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); } } @@ -121,8 +121,8 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X, eager_test::CompareVariableWithValue(Out, result["Out"]); // Examine Backward Grad (w.r.t max_num_runs = 2) - eager_test::CompareGradVariableWithValue(X, result["GradX"]); - eager_test::CompareGradVariableWithValue(Ws[0], result["GradW"]); + eager_test::CompareGradTensorWithValue(X, result["GradX"]); + eager_test::CompareGradTensorWithValue(Ws[0], result["GradW"]); } } diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index a06091247bf7a..b5ce9223f6c97 100644 --- 
a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -54,7 +54,7 @@ TEST(Generated, Sigmoid) { RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - eager_test::CompareGradVariableWithValue(tensor, 0.25); + eager_test::CompareGradTensorWithValue(tensor, 0.25); } TEST(Generated, Matmul_v2) { @@ -85,8 +85,8 @@ TEST(Generated, Matmul_v2) { std::vector target_tensors = {output_tensor}; RunBackward(target_tensors, {}); - eager_test::CompareGradVariableWithValue(X, 2.0 * 20); - eager_test::CompareGradVariableWithValue(Y, 3.0 * 4); + eager_test::CompareGradTensorWithValue(X, 2.0 * 20); + eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); } TEST(Generated, ElementwiseAdd) { @@ -116,8 +116,8 @@ TEST(Generated, ElementwiseAdd) { std::vector target_tensors = {output_tensor}; RunBackward(target_tensors, {}); - eager_test::CompareGradVariableWithValue(X, 1.0); - eager_test::CompareGradVariableWithValue(Y, 1.0); + eager_test::CompareGradTensorWithValue(X, 1.0); + eager_test::CompareGradTensorWithValue(Y, 1.0); } } // namespace egr diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 7f131f9ccd742..c56fe5be4da69 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -35,7 +35,7 @@ limitations under the License. */ namespace paddle { namespace pybind { -extern PyTypeObject* pEagerTensorType; +extern PyTypeObject* p_eager_tensor_type; static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, PyObject* args, PyObject* kwargs) { @@ -167,7 +167,7 @@ static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, EAGER_SYNC_TRY VLOG(4) << "ClearGradient " << self->eager_tensor.name(); - egr::EagerTensor grad; + egr::EagerTensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { // Add RetainGrad as PostHook to AccumulationNode std::shared_ptr grad_node = @@ -182,14 +182,14 @@ static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, grad = accumulation_grad_node->Grad(); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); - grad = meta->Grad(); + grad = meta->MutableGrad(); } - if (grad.initialized()) { + if (grad->initialized()) { VLOG(4) << "Gradient of " << self->eager_tensor.name() << " is initialized, will be released."; auto dense_tensor = - std::dynamic_pointer_cast(grad.impl()); + std::dynamic_pointer_cast(grad->impl()); dense_tensor->release(); } Py_INCREF(Py_None); @@ -202,7 +202,6 @@ static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, EAGER_TRY VLOG(4) << "ZeroGrads " << self->eager_tensor.name(); - egr::EagerTensor grad; if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { // Add RetainGrad as PostHook to AccumulationNode std::shared_ptr grad_node = @@ -214,21 +213,54 @@ static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - grad = accumulation_grad_node->Grad(); + if (accumulation_grad_node->Grad()->initialized()) { + accumulation_grad_node->Grad()->set_tensor( + std::make_shared( + paddle::experimental::zeros_like( + *(accumulation_grad_node->Grad()->Tensor().get())))); + } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); - grad = meta->Grad(); + if (meta->MutableGrad()->initialized()) { + meta->MutableGrad()->set_tensor( + std::make_shared( + paddle::experimental::zeros_like( + 
*(meta->MutableGrad()->Tensor().get())))); + } } - if (grad.initialized()) { - grad.set_tensor(std::make_shared( - paddle::experimental::zeros_like(*(grad.Tensor().get())))); - } Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_tensor_method_detach(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_SYNC_TRY + PADDLE_ENFORCE_EQ( + self->eager_tensor.initialized(), true, + platform::errors::InvalidArgument("Tensor %s has not been initialized!", + self->eager_tensor.name())); + + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eager_tensor)) egr::EagerTensor(); + v->eager_tensor.set_impl(self->eager_tensor.impl()); + v->eager_tensor.set_name(egr::Controller::Instance().GenerateUniqueName()); + auto autograd_meta_src = + egr::EagerUtils::autograd_meta(&(self->eager_tensor)); + auto autograd_meta = egr::EagerUtils::autograd_meta(&(v->eager_tensor)); + autograd_meta->SetPersistable(autograd_meta_src->Persistable()); + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + + return obj; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -246,6 +278,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, + {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index b147d5fbad0ed..71b8bbbb1a283 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -63,7 +63,6 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, void* closure) { EAGER_SYNC_TRY if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { - // Add RetainGrad as PostHook to AccumulationNode std::shared_ptr grad_node = egr::EagerUtils::grad_node(self->eager_tensor); PADDLE_ENFORCE( @@ -73,7 +72,7 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - return ToPyObject(accumulation_grad_node->Grad()); + return ToPyObject(*accumulation_grad_node->Grad()); } else { VLOG(6) << "Get grad for tensor: " << self->eager_tensor.name(); auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); @@ -82,6 +81,27 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +int eager_tensor_properties_set_grad(EagerTensorObject* self, PyObject* value, + void* closure) { + EAGER_SYNC_TRY + auto src = CastPyArg2EagerTensor(value, 0); + PADDLE_ENFORCE( + egr::egr_utils_api::IsLeafTensor(self->eager_tensor), + paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->Grad()->copy_(src, true); + 
return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self, PyObject* value, void* closure) { EAGER_SYNC_TRY @@ -147,8 +167,8 @@ PyObject* eager_tensor_properties_get_dtype(EagerTensorObject* self, } struct PyGetSetDef variable_properties[] = { - {"grad", (getter)eager_tensor_properties_get_grad, nullptr, nullptr, - nullptr}, + {"grad", (getter)eager_tensor_properties_get_grad, + (setter)eager_tensor_properties_set_grad, nullptr, nullptr}, {"name", (getter)eager_tensor_properties_get_name, (setter)eager_tensor_properties_set_name, nullptr, nullptr}, {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index c61f87ccf9089..e06e7f52dd671 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -22,7 +22,7 @@ from .. import framework from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, _in_eager_mode +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, _in_eager_mode, EagerParamBase from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -149,7 +149,7 @@ def set_value(self, value): out = linear(t) # call with different weight """ - if _in_eager_mode(): + if core._in_eager_mode(): base_tensor = core.eager.EagerTensor else: base_tensor = core.VarBase @@ -238,7 +238,7 @@ def backward(self, grad_tensor=None, retain_graph=False): """ if framework.in_dygraph_mode(): if grad_tensor is not None: - if _in_eager_mode(): + if core._in_eager_mode(): assert isinstance( grad_tensor, core.eager.EagerTensor ), "The type of grad_tensor must be paddle.Tensor" @@ -250,7 +250,7 @@ def backward(self, grad_tensor=None, retain_graph=False): "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) - if _in_eager_mode(): + if core._in_eager_mode(): if grad_tensor is None: grad_tensor = [] else: @@ -258,7 +258,7 @@ def backward(self, grad_tensor=None, retain_graph=False): if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) - if _in_eager_mode(): + if core._in_eager_mode(): core.eager.run_backward([scaled_loss], grad_tensor, retain_graph) else: @@ -266,7 +266,7 @@ def backward(self, grad_tensor=None, retain_graph=False): retain_graph, framework._dygraph_tracer()) else: - if _in_eager_mode(): + if core._in_eager_mode(): core.eager.run_backward([self], grad_tensor, retain_graph) else: core.dygraph_run_backward([self], [grad_tensor], @@ -305,7 +305,7 @@ def gradient(self): # [500.] 
""" - if _in_eager_mode(): + if core._in_eager_mode(): if not self.grad._is_initialized(): return None # TODO(wanghuancoder) support SELECTED_ROWS @@ -587,7 +587,7 @@ def __str__(self): # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - if _in_eager_mode(): + if core._in_eager_mode(): from paddle.tensor.to_string import eager_tensor_to_string return eager_tensor_to_string(self) else: @@ -619,7 +619,7 @@ def __deepcopy__(self, memo): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - if _in_eager_mode(): + if core._in_eager_mode(): new_varbase = core.eager.EagerTensor() else: new_varbase = core.VarBase() @@ -763,6 +763,14 @@ def _grad_ivar(self): else: return None + @framework.dygraph_only + def _set_grad_ivar(self, value): + if isinstance(self, EagerParamBase): + self.grad = value + else: + raise TypeError( + "_set_grad_ivar is only supported for Parameter Tensor") + @framework.dygraph_only def clear_gradient(self, set_to_zero=True): if set_to_zero: @@ -770,6 +778,10 @@ def clear_gradient(self, set_to_zero=True): else: self._clear_gradient() + @framework.dygraph_only + def clone(self): + return _C_ops_.assign(self) + if core._in_eager_mode() and not hasattr(core, "eager"): return @@ -790,7 +802,9 @@ def clear_gradient(self, set_to_zero=True): if core._in_eager_mode(): setattr(core.eager.EagerTensor, "_grad_ivar", _grad_ivar) + setattr(core.eager.EagerTensor, "_set_grad_ivar", _set_grad_ivar) setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) + setattr(core.eager.EagerTensor, "clone", clone) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) From e91f7c02b61017486e2c24f023165d92e1988a8f Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Tue, 11 Jan 2022 14:02:45 +0800 Subject: [PATCH 083/151] Jit pre save hook (#38186) * Pre-save hooks of jit.save 1. Added pre_save_hooks features to jit.save. 2. Added related unittests * Added jit pre_save_hooks functions's alias to paddle.jit and copyright. * Make jit.save_pre_hook style be consisent with Paddle's rule. * Fixed arguments passing bug in run_save_pre_hooks * Added API Documents * Move clear and run_pre_save_hooks as internal methonds only. * Made register_save_pre_hook as an internal function. --- python/paddle/fluid/dygraph/jit.py | 101 ++++++++++++++++++ .../unittests/test_jit_pre_save_hooks.py | 59 ++++++++++ python/paddle/jit/__init__.py | 3 +- 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 2db9fb5d76a58..4bfdc3c27fad6 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -1,4 +1,5 @@ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,6 +21,7 @@ import functools from collections import OrderedDict import inspect +import threading import six import paddle @@ -525,6 +527,105 @@ def _build_load_path_and_config(path, config): return model_path, config +_save_pre_hooks_lock = threading.Lock() +_save_pre_hooks = [] + + +class HookRemoveHelper(object): + """ A HookRemoveHelper that can be used to remove hook. """ + + def __init__(self, hook): + self._hook = hook + + def remove(self): + _remove_save_pre_hook(self._hook) + + +def _register_save_pre_hook(hook): + """ + Register a save pre-hook for `paddle.jit.save`. + This hook will be executed before `save` function has been invoked. + + hook(layer, input_spec, configs) -> None + - layer (Layer|function): This argument is corresponding to `layer` in `paddle.jit.save`. + - input_spec (list or tuple[InputSpec|Tensor|Python built-in variable]): This argument is corresponding to `input_spec` in `paddle.jit.save`. + - configs (dict): This argument is corresponding to `configs` in `paddle.jit.save`. + + Args: + hook(function): a function registered as a save pre-hook + + Returns: + HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()`. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + IMAGE_SIZE = 256 + CLASS_NUM = 10 + + class LinearNet(paddle.nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = paddle.nn.Linear(IMAGE_SIZE, CLASS_NUM) + + def forward(self, x): + return self._linear(x) + + saving_count = 0 + def save_pre_hook(layer, input_spec, configs): + global saving_count + saving_count += 1 + + remove_handler = paddle.jit.register_save_pre_hook(save_pre_hook) + + layer = LinearNet() + paddle.jit.save(layer, "/tmp", [paddle.static.InputSpec(shape=[-1, IMAGE_SIZE])]) + # saving_count == 1 + + remove_handler.remove() + paddle.jit.save(layer, "/tmp", [paddle.static.InputSpec(shape=[-1, IMAGE_SIZE])]) + # saving_count == 1 + """ + global _save_pre_hooks_lock + global _save_pre_hooks + _save_pre_hooks_lock.acquire() + if hook not in _save_pre_hooks: + _save_pre_hooks.append(hook) + _save_pre_hooks_lock.release() + return HookRemoveHelper(hook) + + +def _clear_save_pre_hooks(): + global _save_pre_hooks_lock + global _save_pre_hooks + _save_pre_hooks_lock.acquire() + _save_pre_hooks.clear() + _save_pre_hooks_lock.release() + + +def _remove_save_pre_hook(hook): + global _save_pre_hooks_lock + global _save_pre_hooks + _save_pre_hooks_lock.acquire() + if hook in _save_pre_hooks: + _save_pre_hooks.remove(hook) + _save_pre_hooks_lock.release() + + +def _run_save_pre_hooks(func): + def wrapper(layer, path, input_spec=None, **configs): + global _save_pre_hooks + for hook in _save_pre_hooks: + hook(layer, input_spec, configs) + func(layer, path, input_spec, **configs) + + return wrapper + + +@_run_save_pre_hooks @switch_to_static_graph def save(layer, path, input_spec=None, **configs): """ diff --git a/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py b/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py new file mode 100644 index 0000000000000..a938024e3c9b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +from paddle.fluid.dygraph.jit import _run_save_pre_hooks, _clear_save_pre_hooks, _register_save_pre_hook + +_counter = 0 + + +class TestPreSaveHooks(unittest.TestCase): + def test_pre_save_hook_functions(self): + def fake_func(*args, **kwgs): + global _counter + _counter += 1 + + remove_handler = _register_save_pre_hook(fake_func) + self.assertEqual(len(paddle.fluid.dygraph.jit._save_pre_hooks), 1) + self.assertTrue( + paddle.fluid.dygraph.jit._save_pre_hooks[0] is fake_func) + + # Test of avoiding redundancy hanging + remove_handler = _register_save_pre_hook(fake_func) + self.assertEqual(len(paddle.fluid.dygraph.jit._save_pre_hooks), 1) + self.assertTrue( + paddle.fluid.dygraph.jit._save_pre_hooks[0] is fake_func) + + remove_handler.remove() + self.assertEqual(len(paddle.fluid.dygraph.jit._save_pre_hooks), 0) + + remove_handler = _register_save_pre_hook(fake_func) + _clear_save_pre_hooks() + self.assertEqual(len(paddle.fluid.dygraph.jit._save_pre_hooks), 0) + + global _counter + _counter = 0 + remove_handler = _register_save_pre_hook(fake_func) + func_with_hook = _run_save_pre_hooks(fake_func) + func_with_hook(None, None) + self.assertEqual(_counter, 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 576989e8e0d2a..a2af493faca11 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
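The registry-plus-decorator pattern added to jit.py above is small enough to model in isolation. The following plain-Python sketch (no Paddle dependency, simplified by dropping the lock, names chosen to mirror the patch) reproduces the mechanism and also shows why the unit test above ends with `_counter == 2`: `fake_func` is both a registered pre-save hook and the wrapped function, so a single call runs it twice.

    _save_pre_hooks = []

    def register_save_pre_hook(hook):
        if hook not in _save_pre_hooks:   # duplicate registration is a no-op
            _save_pre_hooks.append(hook)
        return lambda: _save_pre_hooks.remove(hook)   # crude stand-in for HookRemoveHelper

    def run_save_pre_hooks(func):
        def wrapper(layer, path, input_spec=None, **configs):
            for hook in _save_pre_hooks:  # every registered hook fires first
                hook(layer, input_spec, configs)
            func(layer, path, input_spec, **configs)
        return wrapper

    calls = []
    register_save_pre_hook(lambda layer, spec, cfg: calls.append("hook"))

    @run_save_pre_hooks
    def fake_save(layer, path, input_spec=None, **configs):
        calls.append("save")

    fake_save(None, "/tmp/model")
    assert calls == ["hook", "save"]      # hooks run before the real save
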
From 0ad363b1527461bf2a1c6c674f6202b8b6c0a48c Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 11 Jan 2022 14:27:51 +0800 Subject: [PATCH 084/151] support vs2019 compilation in windows (#38719) * support vs2019 compilation in windows * not modify pow_op's original compute logic --- cmake/external/protobuf.cmake | 4 ++ .../elementwise/elementwise_functor.h | 42 +++++++++++++++++++ .../elementwise/elementwise_pow_op.cu | 3 +- .../elementwise/elementwise_pow_op.h | 17 +++++++- paddle/fluid/operators/svd_helper.h | 6 +-- paddle/scripts/paddle_build.bat | 7 +++- paddle/utils/small_vector.h | 1 + 7 files changed, 73 insertions(+), 7 deletions(-) mode change 100755 => 100644 paddle/fluid/operators/elementwise/elementwise_pow_op.h diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 2a028b8dc7e7f..f7cb7716969f5 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -207,6 +207,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) elseif(WITH_IPU) SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) + elseif(WIN32) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + # Change the tag to support building with vs2019 + SET(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792) else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index a62c531ff0733..0a6866f578d01 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -174,6 +174,27 @@ struct FMaxFunctor { } }; +template <> +struct FMaxFunctor { + inline HOSTDEVICE int operator()(const int& a, const int& b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t& a, + const int64_t& b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmax(double_a, double_b); + return std::llrint(result); + } +}; + // Fmin template struct FMinFunctor { @@ -194,6 +215,27 @@ struct FMinFunctor { } }; +template <> +struct FMinFunctor { + inline HOSTDEVICE int operator()(const int& a, const int& b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t& a, + const int64_t& b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmin(double_a, double_b); + return std::llrint(result); + } +}; + template struct MulGradFunctor { inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 5335f274ef126..a5570f2cb85d5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -31,7 +31,8 @@ struct CudaPowFunctor< // when cast to int by default and it is wrong. 
// Use llrint to cast it to the nearest integer, which is 3. inline HOSTDEVICE T operator()(const T args[]) const { - return std::llrint(std::pow(args[0], args[1])); + return std::llrint( + std::pow(static_cast(args[0]), static_cast(args[1]))); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h old mode 100755 new mode 100644 index ee718a3ecd1ec..256ab31ead69c --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -31,7 +31,8 @@ struct PowFunctor { // when cast to int by default and it is wrong. // Use llrint to cast it to the nearest integer, which is 3. if (std::is_integral::value) { - return std::llrint(std::pow(a, b)); + return std::llrint( + std::pow(static_cast(a), static_cast(b))); } #endif return std::pow(a, b); @@ -60,13 +61,25 @@ class ElementwisePowKernel : public framework::OpKernel { template struct PowGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + if (std::is_integral::value) { + return dout * y * + std::pow(static_cast(x), static_cast(y - 1)); + } +#endif return dout * y * std::pow(x, y - 1); } }; -template +template struct PowGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + if (std::is_integral::value) { + return dout * std::log(static_cast(x)) * + std::pow(static_cast(x), static_cast(y)); + } +#endif return dout * std::log(x) * std::pow(x, y); } }; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 8d17ddec6fbb4..8a3622a6b1b5e 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -84,7 +84,7 @@ void BatchSvd(const T* X, T* U, T* VH, T* S, int rows, int cols, int batches, template struct PowFunctor { - PowFunctor(const T* input, T* output, int64_t numel, float exp) + PowFunctor(const T* input, T* output, int64_t numel, T exp) : input_(input), output_(output), numel_(numel), exp_(exp) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -93,7 +93,7 @@ struct PowFunctor { const T* input_; T* output_; int64_t numel_; - float exp_; + T exp_; }; template @@ -297,7 +297,7 @@ struct DeviceIndependenceTensorOperations { const framework::ExecutionContext& context) : context(context) {} - framework::Tensor Pow(const framework::Tensor& x, float exp) { + framework::Tensor Pow(const framework::Tensor& x, T exp) { framework::Tensor out; auto for_range = GetForRange(x.numel()); int numel = x.numel(); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 8bb21fa4ef2e1..f64acbeb72307 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -261,6 +261,7 @@ set ON_INFER=ON set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON +set WITH_TPCACHE=OFF call :cmake || goto cmake_error call :build || goto build_error @@ -325,7 +326,11 @@ echo ======================================== rem set vs language to english to block showIncludes, this need vs has installed English language package. set VSLANG=1033 rem Configure the environment for 64-bit builds. 'DISTUTILS_USE_SDK' indicates that the user has selected the compiler. 
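The integer specializations added above (PowFunctor, the FMaxFunctor/FMinFunctor int overloads, and the pow grad functors) all follow the same recipe: promote to float or double, evaluate, then round back with std::lrint/std::llrint instead of letting a C-style cast truncate. The toy snippet below (plain Python, purely illustrative) shows the failure mode those comments describe, using a value just below the exact integer such as a device pow can produce.

    almost_three = 2.9999999999999996   # what a float pow may return when the exact answer is 3
    print(int(almost_three))            # 2 -> truncating cast, the wrong result
    print(round(almost_three))          # 3 -> round-to-nearest, what llrint-style casting gives
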
-call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +echo %task_name%|findstr wincheck_inference >nul && ( + call "D:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" +) || ( + call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +) set DISTUTILS_USE_SDK=1 rem Windows 10 Kit bin dir set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h index e9e7996babcf7..48af2491b89f8 100644 --- a/paddle/utils/small_vector.h +++ b/paddle/utils/small_vector.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include From 9f34a0702213ada872c04ddbc367db2ceedfc697 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Tue, 11 Jan 2022 14:38:25 +0800 Subject: [PATCH 085/151] Remove useless headers for some grad ops (#38823) * fix the wrong filename * first commit * first commit * remove rest useless headers * for ci approval --- .../fluid/operators/elementwise/elementwise_add_op.cu | 10 ---------- .../fluid/operators/elementwise/elementwise_add_op.h | 6 ------ .../fluid/operators/elementwise/elementwise_div_op.h | 11 ----------- .../operators/elementwise/elementwise_floordiv_op.cu | 1 - .../operators/elementwise/elementwise_floordiv_op.h | 3 --- .../fluid/operators/elementwise/elementwise_functor.h | 3 --- .../fluid/operators/elementwise/elementwise_max_op.cu | 1 - .../fluid/operators/elementwise/elementwise_max_op.h | 3 --- .../fluid/operators/elementwise/elementwise_min_op.cu | 1 - .../fluid/operators/elementwise/elementwise_min_op.h | 4 ---- .../fluid/operators/elementwise/elementwise_mod_op.cu | 3 +-- .../fluid/operators/elementwise/elementwise_mod_op.h | 2 -- .../fluid/operators/elementwise/elementwise_mul_op.cu | 8 -------- .../fluid/operators/elementwise/elementwise_mul_op.h | 6 +----- paddle/fluid/operators/elementwise/elementwise_op.h | 2 -- .../fluid/operators/elementwise/elementwise_pow_op.cu | 2 +- .../fluid/operators/elementwise/elementwise_pow_op.h | 1 - .../fluid/operators/elementwise/elementwise_sub_op.cu | 4 ---- .../fluid/operators/elementwise/elementwise_sub_op.h | 5 ----- 19 files changed, 3 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 7b153a4bce86a..b5c19a3edb818 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -// only can include the headers in paddle/top/api dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index d6d79d166d00a..35807d7c57d47 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -17,14 +17,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" - -#include "paddle/fluid/framework/pten_utils.h" // only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index b13a0539ec6ad..d9f7bbc56a902 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -14,21 +14,10 @@ limitations under the License. */ #pragma once -#include #include #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/framework/pten_utils.h" - -// only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index 41a0ae00f270d..3202b0a7d254b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index ae8d2d8625c58..fc8f18161990d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -14,10 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 0a6866f578d01..e2689cefd43a7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -16,9 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index eb6f78bf270ad..760429200889b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index acb212e992a1d..a7a49fed87151 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -15,10 +15,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/eigen_ext.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 59f1c51bce266..b51dbcd883608 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index ebd8f4477d8cf..ffb8c965357a3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -15,11 +15,7 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/eigen_ext.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu index bb49fdbf12dfa..d2106645a4727 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 03884f2a45883..66c3e553c141f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -14,9 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index cdf376fd6a8cc..a8b6c2abe3bf9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,15 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 5cff3173e8115..385c7549e07f2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -15,16 +15,12 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/cpu_info.h" // only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index b7df9bb864db1..e1d9655e293a3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -21,9 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index a5570f2cb85d5..0f3aa8c3e1b9b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -8,7 +8,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" + #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h index 256ab31ead69c..c1fecab8aba1c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index cba261a394732..2ff4033ffe194 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 6a51d7c2a45ad..09818380d8ea7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -14,14 +14,9 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/blas.h" // only can include the headers in paddle/pten/include dirs -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { From fbb4028148cf3a87f4fd464b452597c94e321374 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 11 Jan 2022 15:14:26 +0800 Subject: [PATCH 086/151] [AMP] Check call order of paddle.amp.decorate and paddle.DataParallel (#38785) * check amp.decorate and DataParallel * refine coverage * fix layer dtype * refine code --- python/paddle/fluid/dygraph/amp/auto_cast.py | 4 ++++ python/paddle/fluid/dygraph/layers.py | 2 ++ .../unittests/test_imperative_auto_mixed_precision.py | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 15adf4cb6faaf..f09e210c3c161 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -145,6 +145,10 @@ def check_models(models): raise RuntimeError( "Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.". format(type(model))) + if isinstance(model, paddle.DataParallel): + raise RuntimeError( + "For distributed AMP training, you should first use paddle.amp.decorate() to decotate origin model, and then call paddle.DataParallel get distributed model." 
+ ) def check_optimizers(optimizers): diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 4c37a378e0aae..6a65b3bd9c684 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1569,6 +1569,8 @@ def _apply(self, func, device, dtype, blocking, include_sublayers=True): for key, buf in self._buffers.items(): self._buffers[key] = func(buf, device, dtype, blocking) + self._dtype = dtype + def _to_impl(self, device=None, dtype=None, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index a8ed23f5938c0..62b40f88571d4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -536,6 +536,14 @@ def __init__(self): self.assertRaises(TypeError, test_error_model) + def test_error_distributed_model(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + model = paddle.DataParallel(model) + with fluid.dygraph.guard(): + model = paddle.amp.decorate(models=model, level='O2') + + self.assertRaises(RuntimeError, test_error_distributed_model) + def test_error_optimizer(): class MyOptimizer(object): def __init__(self): From d3ba189548b8e5ca01da310e2945fe9ee4d53b63 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Tue, 11 Jan 2022 16:49:42 +0800 Subject: [PATCH 087/151] =?UTF-8?q?=E3=80=90Auto=20Parallel=E3=80=91New=20?= =?UTF-8?q?local=20tensor=20(#38747)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update dist tensor * add unitest * update unitest * refactor dist tensor * update dist tensor and unitest --- .../distributed/auto_parallel/dist_context.py | 18 +- .../distributed/auto_parallel/dist_tensor.py | 283 +++++++++++++++++- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../test_auto_parallel_dist_tensor.py | 222 ++++++++++++++ 4 files changed, 523 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 12bf14fcce5bd..b194bcc3de6b5 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -62,6 +62,10 @@ def __init__(self, program=None): self._dist_op_context = DistributedOperatorContext() self._process_meshes = [] + # Distributed programs + self._dist_main_programs = {} + self._dist_startup_programs = {} + @property def serial_program(self): return self._serial_program @@ -84,6 +88,14 @@ def process_meshes(self): def dist_op_context(self): return self._dist_op_context + @property + def dist_main_programs(self): + return self._dist_main_programs + + @property + def dist_startup_programs(self): + return self._dist_startup_programs + def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ 'The type of dim_mapping must be ProcessMesh.' 
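The check added to auto_cast.py above means that for distributed pure-fp16 training the model has to be decorated before it is wrapped for data parallelism. A hedged sketch of the expected call order (distributed environment setup and the training loop are omitted):

    import paddle

    model = paddle.nn.Linear(10, 10)
    model = paddle.amp.decorate(models=model, level='O2')  # decorate the plain Layer first
    model = paddle.DataParallel(model)                     # then build the distributed model
    # Reversing these two steps now raises the RuntimeError introduced above.
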
@@ -371,10 +383,14 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_serial_program" or k == "_serial_graph": + if k == "_serial_program" or k == "_serial_graph" or k == "_dist_main_programs" or k == "_dist_startup_programs": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) + + # update dist tensor's dist_context + for key in result._dist_tensors_for_program.keys(): + result._dist_tensors_for_program[key]._dist_context = result return result diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index f46c6e86d6870..5e3c852699ab6 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -13,18 +13,155 @@ # limitations under the License import copy +import inspect + +import paddle from paddle.fluid import core +from paddle.fluid.framework import Parameter, Block, Variable from .dist_attribute import TensorDistributedAttribute from .dist_attribute import get_tensor_dist_attr_field_keys +from .utils import _linear_idx2coordinate class DistributedTensor: - def __init__(self, serial_tensor, dist_attr=None): + """ + DistributedTensor represents the distribution of tensor on the process group and + local tensors can be created by DistributedTensor. + Only support even sharding now and uneven sharding will be supported in the future. + Local tensor information can be obtained from the DistributedTensor instance object, + or obtained by the static methods provided by DistributedTensor, + including shard (i.e. the index in the serial tensor), offsets, and sizes. + """ + + @staticmethod + def _validate_sizes_and_dist_attr(sizes, + dims_mapping, + topology, + processes, + rank=None, + shard_sizes=None): + if not (isinstance(sizes, (list, tuple)) and + all(map(lambda x: isinstance(x, int) and x > 0, sizes))): + raise ValueError( + "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}". + format(sizes)) + if not (isinstance(dims_mapping, (list, tuple)) and all( + map(lambda x: isinstance(x, int) and x >= -1, dims_mapping))): + raise ValueError( + "The dims_mapping must be list or tuple and item in dims_mapping must >= -1, but got {}". + format(dims_mapping)) + if not (isinstance(processes, (list, tuple)) and + all(map(lambda x: isinstance(x, int) and x >= 0, processes))): + raise ValueError( + "The processes must be list or tuple and item in processes must be integer, but got {}". + format(processes)) + if not (isinstance(topology, (list, tuple)) and + all(map(lambda x: isinstance(x, int) and x > 0, topology))): + raise ValueError( + "The topology must be list or tuple and item in topology must be non-negative integer, but got {}". 
+ format(topology)) + if rank is not None and not (isinstance(rank, int) and rank >= 0): + raise ValueError("The rank must >= 0, but got {}".format(rank)) + + # NOTE: Only support even sharding now + if shard_sizes is not None: + raise ValueError("Only support even sharding now.") + + @staticmethod + def get_local_sizes(global_sizes, + dims_mapping, + topology, + processes, + rank=None, + shard_sizes=None): + DistributedTensor._validate_sizes_and_dist_attr( + global_sizes, dims_mapping, topology, processes, rank, shard_sizes) + + local_sizes = [] + # for even sharding, the local sizes of every rank are equal + for idx, item in enumerate(global_sizes): + if dims_mapping[idx] == -1: + local_sizes.append(item) + else: + local_sizes.append(item // topology[dims_mapping[idx]]) + + return local_sizes + + @staticmethod + def get_local_offsets(global_sizes, + dims_mapping, + topology, + processes, + rank, + shard_sizes=None): + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, rank, shard_sizes) + local_offsets = [] + rank_relatvie = processes.index(rank) + coordinate = _linear_idx2coordinate(topology, rank_relatvie) + + for i in range(len(global_sizes)): + if dims_mapping[i] == -1: + local_offsets.append(0) + else: + local_offsets.append(coordinate[dims_mapping[i]] * + local_sizes[i]) + return local_offsets + + @staticmethod + def get_global_sizes(local_sizes, + dims_mapping, + topology, + processes, + rank=None, + shard_sizes=None): + DistributedTensor._validate_sizes_and_dist_attr( + local_sizes, dims_mapping, topology, processes, rank, shard_sizes) + global_sizes = [] + for idx, item in enumerate(local_sizes): + if dims_mapping[idx] == -1: + global_sizes.append(item) + else: + global_sizes.append(item * topology[dims_mapping[idx]]) + return global_sizes + + @staticmethod + def get_local_shard(global_sizes, + dims_mapping, + topology, + processes, + rank, + shard_sizes=None): + local_offsets = DistributedTensor.get_local_offsets( + global_sizes, dims_mapping, topology, processes, rank, shard_sizes) + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, rank, shard_sizes) + assert len(local_sizes) == len( + local_offsets + ), "The length of local_sizes must be equal to local_offsets, but got {} and {}.".format( + len(local_sizes), len(local_offsets)) + + local_end_offsets = list( + map(lambda x: x[0] + x[1], zip(local_offsets, local_sizes))) + local_shard = list(zip(local_offsets, local_end_offsets)) + return local_shard + + def __init__(self, serial_tensor, dist_attr=None, dist_context=None): self._serial_tensor = serial_tensor self._dist_attr = None self._batch_dim = 0 # Reuse the dist_attr setter to initialize _dist_attr self.dist_attr = dist_attr + self._local_sizes_map = {} + self._local_offsets_map = {} + self._local_shard_map = {} + self._local_tensor_map = {} + + from .dist_context import get_default_distributed_context + self._dist_context = dist_context if dist_context is not None else get_default_distributed_context( + ) + # TODO: Add Automatically to dist_context after initialized and it will be adapted in the future. 
+ # self._dist_context.add_dist_tensor_for_program(self) @property def serial_tensor(self): @@ -34,6 +171,10 @@ def serial_tensor(self): def dist_attr(self): return self._dist_attr + @property + def dist_context(self): + return self._dist_context + @dist_attr.setter def dist_attr(self, dist_attr): if self._dist_attr is None: @@ -66,12 +207,150 @@ def validate_dist_attr(self): return False return True + def local_sizes(self, rank=None): + rank = paddle.distributed.get_rank() if rank is None else rank + local_sizes = None + if rank in self._local_sizes_map.keys(): + local_sizes = self._local_sizes_map[rank] + else: + global_sizes = self.serial_tensor.shape + dims_mapping = self.dist_attr.dims_mapping + shard_sizes = self.dist_attr.shard_sizes + processes = self.dist_attr.process_mesh.processes + topology = self.dist_attr.process_mesh.topology + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, rank, + shard_sizes) + self._local_sizes_map[rank] = local_sizes + + return local_sizes + + def local_offsets(self, rank=None): + rank = paddle.distributed.get_rank() if rank is None else rank + local_offsets = None + if rank in self._local_offsets_map.keys(): + local_offsets = self._local_offsets_map[rank] + else: + global_sizes = self.serial_tensor.shape + dims_mapping = self.dist_attr.dims_mapping + shard_sizes = self.dist_attr.shard_sizes + processes = self.dist_attr.process_mesh.processes + topology = self.dist_attr.process_mesh.topology + local_offsets = DistributedTensor.get_local_offsets( + global_sizes, dims_mapping, topology, processes, rank, + shard_sizes) + self._local_offsets_map[rank] = local_offsets + + return local_offsets + + def global_sizes(self): + return self.serial_tensor.shape + + def local_shard(self, rank=None): + rank = paddle.distributed.get_rank() if rank is None else rank + local_shard = None + if rank in self._local_shard_map.keys(): + local_shard = self._local_shard_map[rank] + else: + global_sizes = self.serial_tensor.shape + dims_mapping = self.dist_attr.dims_mapping + shard_sizes = self.dist_attr.shard_sizes + processes = self.dist_attr.process_mesh.processes + topology = self.dist_attr.process_mesh.topology + local_shard = DistributedTensor.get_local_shard( + global_sizes, dims_mapping, topology, processes, rank, + shard_sizes) + self._local_shard_map[rank] = local_shard + + return local_shard + + def new_local_tensor(self, block=None, rank=None, name=None): + """ + Create a new local tensor of serial tensor corresponding to rank. + + Args: + block (Block): The block contains the new tensor. Default value is recommend and it will be created in the block of dist main program corresponding to the serial tensor block id. Default: None. + rank (int): The rank id. Default value is recommend and it will be the current rank. Default: None. 
+ """ + + def _copy_kwargs(serial_tensor): + kwargs = {} + no_need_copy_args = ["self", "block", "shape", "name"] + arg_spec = inspect.getargspec(Variable.__init__) + + for key in arg_spec.args: + # TODO: Check the copied attribute from serial tensor whether valid + if key in no_need_copy_args: + continue + elif key not in kwargs: + if key == "type": + kwargs[key] = serial_tensor.desc.type() + elif key == "dtype": + kwargs[key] = serial_tensor.desc.dtype() + elif key == "lod_level": + kwargs[key] = serial_tensor.desc.lod_level() + elif key == "persistable": + kwargs[key] = serial_tensor.desc.persistable() + elif key == "stop_gradient": + kwargs[key] = serial_tensor.desc.stop_gradient() + elif key == "need_check_feed": + kwargs[key] = serial_tensor.desc.need_check_feed() + # TODO: Get capacity by framework + elif key == "capacity": + continue + else: + kwargs[key] = self.serial_tensor.__dict__[key] + + if isinstance(serial_tensor, Parameter): + kwargs["trainable"] = serial_tensor.trainable + kwargs["optimize_attr"] = serial_tensor.trainable + kwargs["regularizer"] = serial_tensor.regularizer + kwargs["do_model_average"] = serial_tensor.do_model_average + kwargs["need_clip"] = serial_tensor.need_clip + kwargs["is_distributed"] = serial_tensor.is_distributed + kwargs["is_parameter"] = serial_tensor.is_parameter + + return kwargs + + if rank is not None and not (isinstance(rank, int) and rank >= 0): + raise ValueError("The rank must >= 0, but got {}".format(rank)) + if block is not None and not isinstance(block, Block): + raise TypeError("The block must be Block, but got {}.".format( + type(block))) + rank = paddle.distributed.get_rank() if rank is None else rank + + if block is None: + block_id = self.serial_tensor.block.idx + block = self.dist_context.dist_main_programs[rank].block(block_id) + + # copy serial tensor attribute + kwargs = _copy_kwargs(self.serial_tensor) + kwargs["name"] = name + kwargs["shape"] = self.local_sizes(rank) + + if isinstance(self.serial_tensor, Parameter): + kwargs.pop("persistable") + local_tensor = Parameter(block=block, **kwargs) + else: + local_tensor = block.create_var(**kwargs) + + # TODO: Set original id when set original_id is approved + local_tensor.desc.set_original_id(self.serial_tensor.desc.id()) + self._local_tensor_map[rank] = local_tensor + return local_tensor + + def local_tensor(self, rank=None): + rank = paddle.distributed.get_rank() if rank is None else rank + assert rank in self._local_tensor_map, "The rank {} local tensor has not been created.".format( + rank) + return self._local_tensor_map[rank] + def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_serial_tensor": + if k == "_serial_tensor" or k == "_local_tensor_map": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 64c247e56d1d3..b46a10c8c79d8 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -94,6 +94,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_searcher) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_dist_tensor) list(APPEND MIXED_DIST_TEST_OPS 
test_auto_parallel_reshard_serial) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp) @@ -262,6 +263,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_searcher) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_dist_tensor) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) @@ -649,6 +651,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_searcher MODULES test_auto_parallel_searcher ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_dist_tensor MODULES test_auto_parallel_dist_tensor ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py new file mode 100644 index 0000000000000..b21cbb5ae78bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
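The unit test being added here checks a 6x6 tensor laid out on a 2x3 process mesh, and the values it asserts fall directly out of the even-sharding rule implemented by get_local_sizes/get_local_offsets above. As a plain-Python illustration, independent of Paddle and assuming the rank index is unflattened in row-major order (which matches the expected values in the test):

    def linear_idx_to_coord(topology, idx):
        # row-major unflatten of a linear rank index into mesh coordinates
        coord = []
        for dim in reversed(topology):
            coord.append(idx % dim)
            idx //= dim
        return list(reversed(coord))

    def local_sizes_and_offsets(global_sizes, dims_mapping, topology, processes, rank):
        coord = linear_idx_to_coord(topology, processes.index(rank))
        sizes, offsets = [], []
        for size, dim in zip(global_sizes, dims_mapping):
            if dim == -1:
                # replicated axis: keep the full extent
                sizes.append(size)
                offsets.append(0)
            else:
                # even split across topology[dim] ranks
                part = size // topology[dim]
                sizes.append(part)
                offsets.append(coord[dim] * part)
        return sizes, offsets

    # rank 4 of a [2, 3] mesh, dims_mapping [1, 0]: it owns rows 2-3 and columns 3-5
    print(local_sizes_and_offsets([6, 6], [1, 0], [2, 3], list(range(6)), 4))
    # -> ([2, 3], [2, 3])
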
+ +import copy +import unittest + +import paddle +from paddle.fluid import core +import paddle.distributed.auto_parallel as auto +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.dist_tensor import DistributedTensor +from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute +import test_auto_parallel_reshard +from test_auto_parallel_reshard import mlp_forward + + +def get_dist_prog(train_program, + startup_program, + dist_context, + rank_id, + complete_train_program=None): + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion + complete_train_program = auto.complete_annotation( + train_program, dist_context + ) if complete_train_program is None else complete_train_program + + # parallelizer._apply_serial_forward_pass(complete_train_program, + # startup_program) + + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + + # logical partition + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads) + + return auto_parallel_main_prog, auto_parallel_startup_prog, complete_train_program + + +class TestDistributedTensor(unittest.TestCase): + def test_new_local_tensor(self): + test_auto_parallel_reshard._global_process_mesh = auto.ProcessMesh( + mesh=[0, 1]) + test_auto_parallel_reshard._global_parallel_strategy = "dp" + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog, complete_train_program = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + dist_context.dist_main_programs[rank_id] = dist_main_prog + dist_context.dist_startup_programs[rank_id] = dist_startup_prog + name = "layer_norm_1.tmp_2" + dist_tensor = dist_context.get_dist_tensor_for_program( + complete_train_program.global_block().vars[name]) + dist_tensor._dist_context = dist_context + intermediate_var_0 = dist_tensor.new_local_tensor( + name="intermediate_var_0") + self.assertEqual(intermediate_var_0.shape, (2, 1024)) + self.assertEqual(intermediate_var_0.name, "intermediate_var_0") + + rank_id = 1 + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_main_prog, dist_startup_prog, _ = get_dist_prog( + train_program, startup_program, dist_context, rank_id, + complete_train_program) + dist_context.dist_main_programs[rank_id] = dist_main_prog + dist_context.dist_startup_programs[rank_id] = dist_startup_prog + name = "layer_norm_1.tmp_2" + dist_tensor = dist_context.get_dist_tensor_for_program( + complete_train_program.global_block().vars[name]) + dist_tensor._dist_context = dist_context + 
intermediate_var_1 = dist_tensor.new_local_tensor( + rank=rank_id, name="intermediate_var_1") + self.assertEqual(intermediate_var_0.shape, (2, 1024)) + self.assertEqual(intermediate_var_1.name, "intermediate_var_1") + + name = "linear_0.w_0" + dist_tensor = dist_context.get_dist_tensor_for_program( + complete_train_program.global_block().vars[name]) + dist_tensor._dist_context = dist_context + intermediate_var_1 = dist_tensor.new_local_tensor( + rank=rank_id, name="linear_0.w_0_intermediate") + self.assertEqual(intermediate_var_1.shape, (1024, 4096)) + self.assertEqual(intermediate_var_1.name, "linear_0.w_0_intermediate") + + copied_dist_context = copy.deepcopy(dist_context) + self.assertIsNotNone(copied_dist_context) + self.assertEqual( + id(copied_dist_context), + id( + copied_dist_context.get_dist_tensor_for_program( + dist_tensor.serial_tensor).dist_context)) + + def test_static_method(self): + dims_mapping = [1, 0] + processes = [0, 1, 2, 3, 4, 5, 6] + topology = [2, 3] + global_sizes = [6, 6] + + # rank 0 [(0, 2), (0, 3)] + # rank 1 [(2, 4), (0, 3)] + # rank 4 [(2, 4), (3, 6)] + rank = 0 + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = DistributedTensor.get_local_offsets( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_offsets, [0, 0]) + local_shard = DistributedTensor.get_local_shard( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_shard, [(0, 2), (0, 3)]) + + rank = 1 + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = DistributedTensor.get_local_offsets( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_offsets, [2, 0]) + local_shard = DistributedTensor.get_local_shard( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_shard, [(2, 4), (0, 3)]) + + rank = 4 + local_sizes = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = DistributedTensor.get_local_offsets( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_offsets, [2, 3]) + local_shard = DistributedTensor.get_local_shard( + global_sizes, dims_mapping, topology, processes, rank) + self.assertEqual(local_shard, [(2, 4), (3, 6)]) + + # global sizes + local_sizes = [2, 3] + global_sizes = DistributedTensor.get_global_sizes( + local_sizes, dims_mapping, topology, processes) + self.assertEqual(global_sizes, [6, 6]) + + def test_instance_method(self): + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = [1, 0] + tensor_dist_attr.process_mesh = auto.ProcessMesh( + mesh=[[0, 1, 2], [3, 4, 5]]) + serial_tensor = paddle.static.data( + name="data", shape=[6, 6], dtype='float32') + dist_tensor = DistributedTensor(serial_tensor, tensor_dist_attr) + + # rank 0 [(0, 2), (0, 3)] + # rank 1 [(2, 4), (0, 3)] + # rank 4 [(2, 4), (3, 6)] + rank = 0 + local_sizes = dist_tensor.local_sizes(rank) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = dist_tensor.local_offsets(rank) + self.assertEqual(local_offsets, [0, 0]) + local_shard = dist_tensor.local_shard(rank) + self.assertEqual(local_shard, [(0, 2), (0, 3)]) + self.assertEqual(local_sizes, dist_tensor.local_sizes(rank)) + self.assertEqual(local_offsets, dist_tensor.local_offsets(rank)) + 
self.assertEqual(local_shard, dist_tensor.local_shard(rank)) + self.assertEqual(local_sizes, dist_tensor.local_sizes()) + self.assertEqual(local_offsets, dist_tensor.local_offsets()) + self.assertEqual(local_shard, dist_tensor.local_shard()) + + rank = 1 + local_sizes = dist_tensor.local_sizes(rank) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = dist_tensor.local_offsets(rank) + self.assertEqual(local_offsets, [2, 0]) + local_shard = dist_tensor.local_shard(rank) + self.assertEqual(local_shard, [(2, 4), (0, 3)]) + + rank = 4 + local_sizes = dist_tensor.local_sizes(rank) + self.assertEqual(local_sizes, [2, 3]) + local_offsets = dist_tensor.local_offsets(rank) + self.assertEqual(local_offsets, [2, 3]) + local_shard = dist_tensor.local_shard(rank) + self.assertEqual(local_shard, [(2, 4), (3, 6)]) + + global_sizes = dist_tensor.global_sizes() + self.assertEqual(global_sizes, (6, 6)) + + +if __name__ == "__main__": + unittest.main() From 29c211ee079c03b14929f9354002ade6752e2238 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 11 Jan 2022 17:43:51 +0800 Subject: [PATCH 088/151] Support test_numpy_bridge and thread_local_has_grad (#38835) --- .../unittests/test_imperative_numpy_bridge.py | 14 ++++++++++++-- .../test_imperative_thread_local_has_grad.py | 8 +++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 772dd913e4d20..4f3089baffdd3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -16,10 +16,11 @@ import numpy as np import paddle.fluid as fluid import warnings +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class TestImperativeNumpyBridge(unittest.TestCase): - def test_tensor_from_numpy(self): + def func_tensor_from_numpy(self): data_np = np.array([[2, 3, 1]]).astype('float32') with fluid.dygraph.guard(fluid.CPUPlace()): with warnings.catch_warnings(record=True) as w: @@ -39,9 +40,18 @@ def test_tensor_from_numpy(self): self.assertTrue(np.array_equal(var2.numpy(), data_np)) data_np[0][0] = -1 self.assertEqual(data_np[0][0], -1) - self.assertNotEqual(var2[0][0].numpy()[0], -1) + if _in_eager_mode(): + # eager_mode, var2 is EagerTensor, is not subscriptable + self.assertNotEqual(var2.numpy()[0][0], -1) + else: + self.assertNotEqual(var2[0][0].numpy()[0], -1) self.assertFalse(np.array_equal(var2.numpy(), data_np)) + def test_func_tensor_from_numpy(self): + with _test_eager_guard(): + self.func_tensor_from_numpy() + self.func_tensor_from_numpy() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py index d81849725d75a..f54e50953f131 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py @@ -18,6 +18,7 @@ import paddle.nn as nn import numpy as np import threading +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleNet(nn.Layer): @@ -44,7 +45,7 @@ def thread_2_main(self): x = net(x) self.assertFalse(x.stop_gradient) - def test_main(self): + def func_main(self): threads = [] for _ in range(10): threads.append(threading.Thread(target=self.thread_1_main)) @@ -54,6 +55,11 @@ def test_main(self): 
for t in threads: t.join() + def test_main(self): + with _test_eager_guard(): + self.func_main() + self.func_main() + if __name__ == "__main__": unittest.main() From 2bed9b9c5970497cfbbff197d6eb7a4b87680dd2 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 11 Jan 2022 19:16:15 +0800 Subject: [PATCH 089/151] [PTEN] Add pten::Place data structure. (#38844) * add pten::Place data structure. * update ci problem * fix ci problem * update --- paddle/pten/api/lib/utils/CMakeLists.txt | 4 +- paddle/pten/api/lib/utils/place_utils.cc | 62 ------------ paddle/pten/api/lib/utils/place_utils.h | 28 ------ paddle/pten/common/CMakeLists.txt | 2 +- paddle/pten/common/device.cc | 65 ------------- paddle/pten/common/device.h | 70 -------------- paddle/pten/common/place.cc | 57 +++++++++-- paddle/pten/common/place.h | 109 +++++++++++++++------- paddle/pten/tests/api/CMakeLists.txt | 1 - paddle/pten/tests/api/test_place_utils.cc | 77 --------------- paddle/pten/tests/common/CMakeLists.txt | 1 + paddle/pten/tests/common/test_place.cc | 53 +++++++++++ 12 files changed, 184 insertions(+), 345 deletions(-) delete mode 100644 paddle/pten/api/lib/utils/place_utils.cc delete mode 100644 paddle/pten/api/lib/utils/place_utils.h delete mode 100644 paddle/pten/common/device.cc delete mode 100644 paddle/pten/common/device.h delete mode 100644 paddle/pten/tests/api/test_place_utils.cc create mode 100644 paddle/pten/tests/common/test_place.cc diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 06178dad43767..4a44ad7758b56 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc place_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits pten_common) +cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/place_utils.cc b/paddle/pten/api/lib/utils/place_utils.cc deleted file mode 100644 index af4f84b1ad836..0000000000000 --- a/paddle/pten/api/lib/utils/place_utils.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/pten/api/lib/utils/place_utils.h" -#include "paddle/pten/api/ext/exception.h" - -namespace paddle { -namespace experimental { - -Place ConvertToPtenPlace(const platform::Place& src) { - Place place; - if (platform::is_cpu_place(src)) { - place.Reset(Device(DeviceType::kHost, 0)); - } else if (platform::is_gpu_place(src)) { - place.Reset( - Device(DeviceType::kCuda, - BOOST_GET_CONST(platform::CUDAPlace, src).GetDeviceId())); - } else if (platform::is_cuda_pinned_place(src)) { - place.Reset(Device(DeviceType::kCuda, 0), true); - } else if (platform::is_xpu_place(src)) { - place.Reset(Device(DeviceType::kXpu, - BOOST_GET_CONST(platform::XPUPlace, src).GetDeviceId())); - } else { - PD_THROW("Invalid platform place type."); - } - return place; -} - -platform::Place ConvertToPlatformPlace(const Place& src) { - switch (src.device().type()) { - case DeviceType::kHost: { - return platform::CPUPlace(); - } - case DeviceType::kCuda: { - if (src.is_pinned()) { - return platform::CUDAPinnedPlace(); - } else { - return platform::CUDAPlace(src.device().id()); - } - } - case DeviceType::kXpu: { - return platform::XPUPlace(src.device().id()); - } - default: - PD_THROW("Invalid pten place type."); - } - return {}; -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/lib/utils/place_utils.h b/paddle/pten/api/lib/utils/place_utils.h deleted file mode 100644 index 9ac10158040b2..0000000000000 --- a/paddle/pten/api/lib/utils/place_utils.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/place.h" -#include "paddle/pten/common/place.h" - -namespace paddle { -namespace experimental { - -Place ConvertToPtenPlace(const platform::Place& src); - -platform::Place ConvertToPlatformPlace(const Place& src); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/common/CMakeLists.txt b/paddle/pten/common/CMakeLists.txt index c4083d7f0d756..feaf0e12bdb16 100644 --- a/paddle/pten/common/CMakeLists.txt +++ b/paddle/pten/common/CMakeLists.txt @@ -1 +1 @@ -cc_library(pten_common SRCS device.cc place.cc DEPS enforce) +cc_library(pten_place SRCS place.cc) diff --git a/paddle/pten/common/device.cc b/paddle/pten/common/device.cc deleted file mode 100644 index 55130067ae200..0000000000000 --- a/paddle/pten/common/device.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/common/device.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/pten/api/ext/exception.h" - -namespace paddle { -namespace experimental { - -const char* DeviceTypeStr(DeviceType type) { - switch (type) { - case DeviceType::kUndef: - return "kUndef"; - case DeviceType::kHost: - return "kHost"; - case DeviceType::kXpu: - return "kXpu"; - case DeviceType::kCuda: - return "kCuda"; - case DeviceType::kHip: - return "kHip"; - case DeviceType::kNpu: - return "kNpu"; - default: - PD_THROW("Invalid pten device type."); - } - return {}; -} - -Device::Device(DeviceType type, int8_t id) : type_(type), id_(id) { - PADDLE_ENFORCE_GE( - id, - 0, - platform::errors::InvalidArgument( - "The device id needs to start from zero, but you passed in %d.", id)); -} - -Device::Device(DeviceType type) : type_(type), id_(0) { - PADDLE_ENFORCE_EQ( - type, - DeviceType::kHost, - platform::errors::InvalidArgument( - "The device id needs to start from zero, but you passed in %s.", - DeviceTypeStr(type))); -} - -std::string Device::DebugString() const { - std::string str{"DeviceType:"}; - return str + DeviceTypeStr(type_) + ", id: " + std::to_string(id_); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/common/device.h b/paddle/pten/common/device.h deleted file mode 100644 index eddb71bce16da..0000000000000 --- a/paddle/pten/common/device.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -namespace paddle { -namespace experimental { - -enum class DeviceType : int8_t { - kUndef = 0, - kHost = 1, - kXpu = 2, - kCuda = 3, - kHip = 4, - kNpu = 5, -}; - -const char* DeviceTypeStr(DeviceType type); - -/// \brief The device is used to store hardware information. It has not yet -/// stored information related to the math acceleration library. -struct Device final { - public: - Device() = default; - - Device(DeviceType type, int8_t id); - - Device(DeviceType type); - - DeviceType type() const noexcept { return type_; } - - /// \brief Returns the index of the device. Here, -1 is used to indicate an - /// invalid value, and 0 to indicate a default value. - /// \return The index of the device. 
- int8_t id() const noexcept { return id_; } - - void set_type(DeviceType type) noexcept { type_ = type; } - - void set_id(int8_t id) noexcept { id_ = id; } - - std::string DebugString() const; - - private: - friend bool operator==(const Device&, const Device&) noexcept; - - private: - DeviceType type_{DeviceType::kUndef}; - int8_t id_{-1}; -}; - -inline bool operator==(const Device& lhs, const Device& rhs) noexcept { - return (lhs.type_ == rhs.type_) && (lhs.id_ == rhs.id_); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/common/place.cc b/paddle/pten/common/place.cc index ba34c5d0f9222..2d33bb508af44 100644 --- a/paddle/pten/common/place.cc +++ b/paddle/pten/common/place.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,57 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/common/place.h" -#include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace experimental { +#include +#include + +#include "paddle/pten/api/ext/exception.h" + +namespace pten { + +const char *AllocationTypeStr(AllocationType type) { + switch (type) { + case AllocationType::UNDEF: + return "undef"; + case AllocationType::CPU: + return "cpu"; + case AllocationType::GPU: + return "gpu"; + case AllocationType::GPUPINNED: + return "gpu pinned"; + case AllocationType::XPU: + return "xpu"; + case AllocationType::NPU: + return "npu"; + case AllocationType::NPUPINNED: + return "npu pinned"; + case AllocationType::IPU: + return "ipu"; + case AllocationType::MLU: + return "mlu"; + default: + PD_THROW("Invalid pten device type."); + return {}; + } +} std::string Place::DebugString() const { - return device_.DebugString() + ", is_pinned: " + std::to_string(is_pinned_); + std::ostringstream os; + os << "Place("; + os << AllocationTypeStr(alloc_type_); + if (alloc_type_ == AllocationType::GPUPINNED || + alloc_type_ == AllocationType::NPUPINNED || + alloc_type_ == AllocationType::CPU) { + os << ")"; + } else { + os << ":" << std::to_string(device) << ")"; + } + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, const Place &p) { + os << p.DebugString(); + return os; } -} // namespace experimental -} // namespace paddle +} // namespace pten diff --git a/paddle/pten/common/place.h b/paddle/pten/common/place.h index fdc948734934b..24d24305202cf 100644 --- a/paddle/pten/common/place.h +++ b/paddle/pten/common/place.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,52 +16,97 @@ limitations under the License. */ #include -#include "paddle/pten/common/device.h" +namespace pten { + +enum class AllocationType : int8_t { + UNDEF = 0, + CPU = 1, + GPU = 2, + GPUPINNED = 3, + XPU = 4, + NPU = 5, + NPUPINNED = 6, + IPU = 7, + MLU = 8, +}; -namespace paddle { -namespace experimental { +const char *AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. 
-class Place final { +class Place { public: - Place() = default; - - explicit Place(const Device& device) : device_(device) {} + Place() : device(0), alloc_type_(AllocationType::UNDEF) {} - Place(DeviceType type, int8_t id) : device_(type, id) {} + explicit Place(AllocationType type, int8_t id) + : device(id), alloc_type_(type) {} - Place(DeviceType type) : device_(type) {} + explicit Place(AllocationType type) : device(0), alloc_type_(type) {} - Place(const Device& device, bool is_pinned) noexcept : device_(device), - is_pinned_(is_pinned) { + void Reset(AllocationType type, int8_t device_id = 0) noexcept { + alloc_type_ = type; + device = device_id; } - const Device& device() const noexcept { return device_; } + AllocationType GetType() const { return alloc_type_; } - /// \brief Returns whether the memory is a locked page. The page lock - /// memory is actually located in the host memory, but it can only be - /// used by certain devices and can be directly transferred by DMA. - /// \return Whether the memory is a locked page. - bool is_pinned() const noexcept { return is_pinned_; } - - void Reset(const Device& device, bool is_pinned = false) noexcept { - device_ = device; - is_pinned_ = is_pinned; - } + int8_t GetDeviceId() const { return device; } std::string DebugString() const; - private: - friend bool operator==(const Place&, const Place&) noexcept; + public: + // TODO(wilber): Just because of backward compatibility, it needs to be + // changed to private in the future. + int8_t device; private: - Device device_; - bool is_pinned_{false}; + AllocationType alloc_type_; +}; + +class CPUPlace : public Place { + public: + CPUPlace() : Place(AllocationType::CPU, 0) {} +}; + +class GPUPlace : public Place { + public: + GPUPlace() : Place(AllocationType::GPU, 0) {} + explicit GPUPlace(int device_id) : Place(AllocationType::GPU, device_id) {} +}; + +class GPUPinnedPlace : public Place { + public: + GPUPinnedPlace() : Place(AllocationType::GPUPINNED) {} +}; + +class XPUPlace : public Place { + public: + XPUPlace() : Place(AllocationType::XPU, 0) {} + explicit XPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {} +}; + +class NPUPlace : public Place { + public: + NPUPlace() : Place(AllocationType::NPU, 0) {} + explicit NPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {} +}; + +class NPUPinnedPlace : public Place { + public: + NPUPinnedPlace() : Place(AllocationType::NPUPINNED) {} +}; + +class IPUPlace : public Place { + public: + IPUPlace() : Place(AllocationType::XPU, 0) {} + explicit IPUPlace(int device_id) : Place(AllocationType::XPU, device_id) {} +}; + +class MLUPlace : public Place { + public: + MLUPlace() : Place(AllocationType::MLU, 0) {} + explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {} }; -inline bool operator==(const Place& lhs, const Place& rhs) noexcept { - return (lhs.device_ == rhs.device_) && (lhs.is_pinned_ == rhs.is_pinned_); -} +std::ostream &operator<<(std::ostream &, const Place &); -} // namespace experimental -} // namespace paddle +} // namespace pten diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index bb1eab2c09551..ffbc551843148 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -7,7 +7,6 @@ endif() cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils) cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils) 
-cc_test(test_framework_place_utils storage SRCS test_place_utils.cc DEPS pten_api_utils) cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_place_utils.cc b/paddle/pten/tests/api/test_place_utils.cc deleted file mode 100644 index 4db1f59d83786..0000000000000 --- a/paddle/pten/tests/api/test_place_utils.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "gtest/gtest.h" - -#include "paddle/pten/api/lib/utils/place_utils.h" - -namespace paddle { -namespace experimental { -namespace tests { - -TEST(place_utils, cpu_place) { - auto pd_place = platform::CPUPlace(); - Place pten_place = ConvertToPtenPlace(pd_place); - CHECK_EQ(pten_place.device().id(), 0); - CHECK(pten_place.device().type() == DeviceType::kHost); - CHECK(pten_place.is_pinned() == false); - - auto pd_place_1 = ConvertToPlatformPlace(pten_place); - CHECK(platform::is_cpu_place(pd_place_1)); - CHECK(pd_place == BOOST_GET_CONST(platform::CPUPlace, pd_place_1)); - CHECK(pten_place == ConvertToPtenPlace(pd_place_1)); -} - -TEST(place_utils, cuda_place) { - auto pd_place = platform::CUDAPlace(1); - Place pten_place = ConvertToPtenPlace(pd_place); - CHECK_EQ(pten_place.device().id(), 1); - CHECK(pten_place.device().type() == DeviceType::kCuda); - CHECK(pten_place.is_pinned() == false); - - auto pd_place_1 = ConvertToPlatformPlace(pten_place); - CHECK(platform::is_gpu_place(pd_place_1)); - CHECK(pd_place == BOOST_GET_CONST(platform::CUDAPlace, pd_place_1)); - CHECK(pten_place == ConvertToPtenPlace(pd_place_1)); -} - -TEST(place_utils, cuda_pinned_place) { - auto pd_place = platform::CUDAPinnedPlace(); - Place pten_place = ConvertToPtenPlace(pd_place); - CHECK_EQ(pten_place.device().id(), 0); - CHECK(pten_place.device().type() == DeviceType::kCuda); - CHECK(pten_place.is_pinned() == true); - - auto pd_place_1 = ConvertToPlatformPlace(pten_place); - CHECK(platform::is_cuda_pinned_place(pd_place_1)); - CHECK(pd_place == BOOST_GET_CONST(platform::CUDAPinnedPlace, pd_place_1)); - CHECK(pten_place == ConvertToPtenPlace(pd_place_1)); -} - -TEST(place_utils, xpu_place) { - auto pd_place = platform::XPUPlace(1); - Place pten_place = ConvertToPtenPlace(pd_place); - CHECK_EQ(pten_place.device().id(), 1); - CHECK(pten_place.device().type() == DeviceType::kXpu); - CHECK(pten_place.is_pinned() == false); - - auto pd_place_1 = ConvertToPlatformPlace(pten_place); - CHECK(platform::is_xpu_place(pd_place_1)); - CHECK(pd_place == BOOST_GET_CONST(platform::XPUPlace, pd_place_1)); - CHECK(pten_place == ConvertToPtenPlace(pd_place_1)); -} - -} // namespace tests -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/tests/common/CMakeLists.txt b/paddle/pten/tests/common/CMakeLists.txt index c0a5414d53e47..f54b37cb976c5 100644 --- a/paddle/pten/tests/common/CMakeLists.txt +++ 
b/paddle/pten/tests/common/CMakeLists.txt @@ -1,3 +1,4 @@ cc_test(pten_test_backend SRCS test_backend.cc DEPS gtest) cc_test(pten_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(pten_test_data_type SRCS test_data_type.cc DEPS gtest) +cc_test(pten_test_place SRCS test_place.cc DEPS pten_place) diff --git a/paddle/pten/tests/common/test_place.cc b/paddle/pten/tests/common/test_place.cc new file mode 100644 index 0000000000000..0bbd8f1d42273 --- /dev/null +++ b/paddle/pten/tests/common/test_place.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/common/place.h" + +#include "gtest/gtest.h" + +namespace pten { +namespace tests { + +TEST(PtenPlace, place) { + pten::Place place; + EXPECT_EQ(place.GetType(), pten::AllocationType::UNDEF); + + place.Reset(pten::AllocationType::GPU, 1); + EXPECT_EQ(place.GetType(), pten::AllocationType::GPU); + EXPECT_EQ(place.GetDeviceId(), 1); +} + +TEST(Place, cpu_place) { + pten::CPUPlace place; + EXPECT_EQ(place.GetType(), pten::AllocationType::CPU); + std::cout << "cpu place repr: " << place << std::endl; +} + +TEST(Place, gpu_place) { + pten::GPUPlace place; + EXPECT_EQ(place.GetType(), pten::AllocationType::GPU); + EXPECT_EQ(place.GetDeviceId(), 0); + + pten::GPUPlace place1(2); + EXPECT_EQ(place1.GetType(), pten::AllocationType::GPU); + EXPECT_EQ(place1.GetDeviceId(), 2); + std::cout << "gpu place repr: " << place1 << std::endl; + + pten::GPUPinnedPlace place2; + EXPECT_EQ(place2.GetType(), pten::AllocationType::GPUPINNED); + std::cout << "gpu pinned place repr: " << place2 << std::endl; +} + +} // namespace tests +} // namespace pten From 3eaf8d2cead9fc3d7b82c5c928c331917ea687b6 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:49:01 +0800 Subject: [PATCH 090/151] Modified Kernel Primitive API and elementwise for xpu2 #38688 --- .../elementwise/elementwise_op_broadcast.cu.h | 8 +- .../elementwise/elementwise_op_impl.cu.h | 3 +- .../datamover_primitives_xpu2.h | 172 +++++++++--------- .../kernel_primitives/kernel_primitives.h | 15 +- paddle/fluid/platform/hostdevice.h | 9 +- paddle/pten/kernels/gpu/elementwise.h | 104 +++++------ 6 files changed, 164 insertions(+), 147 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 25c983566b371..e3d4607b7130c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -25,8 +25,7 @@ namespace kps = paddle::operators::kernel_primitives; template void LaunchBroadcastElementwiseCudaKernel( - const platform::CUDADeviceContext &ctx, - const std::vector &ins, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, Functor func) { std::vector pt_inputs; std::vector pt_outputs; @@ -58,8 +57,7 @@ void LaunchBroadcastElementwiseCudaKernel( 
template void LaunchElementwiseCudaKernel( - const platform::CUDADeviceContext &cuda_ctx, - const std::vector &ins, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, Functor func) { std::vector pt_inputs; std::vector pt_outputs; @@ -85,7 +83,7 @@ void LaunchElementwiseCudaKernel( pt_outputs.push_back(pt_outputs_tmp[i].get()); } pten::LaunchElementwiseCudaKernel( - cuda_ctx, pt_inputs, &pt_outputs, axis, func); + ctx, pt_inputs, &pt_outputs, axis, func); } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 1d8acd5eca5d9..36ff1ae254d20 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -35,8 +35,7 @@ using ElementwiseType = pten::ElementwiseType; template void LaunchSameDimsElementwiseCudaKernel( - const platform::CUDADeviceContext &ctx, - const std::vector &ins, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { std::vector pt_inputs; std::vector pt_outputs; diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h index b27ba27b3c6f1..333899535894e 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h @@ -32,42 +32,50 @@ struct alignas(sizeof(T) * VecSize) VectorType { * index of the output data. if input or output shape is [dim0, dim1] then dims * must be [dim1, dim0]. */ +#pragma pack(4) template struct BroadcastConfig { - uint32_t stride_in[framework::DDim::kMaxRank]; - uint32_t stride_out[framework::DDim::kMaxRank]; - uint32_t shape_in[framework::DDim::kMaxRank]; + int strides_in[framework::DDim::kMaxRank]; + int strides_out[framework::DDim::kMaxRank]; + int in_dim[framework::DDim::kMaxRank]; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, int dim_size) { - std::vector strides_in; - std::vector strides_out; - std::vector shapes_in; - - strides_out.resize(dim_size, 1); - strides_in.resize(dim_size, 1); - shapes_in.resize(dim_size, 1); - - for (int i = 0; i < dim_size; ++i) { - shape_in[i] = in_dims[dim_size - i - 1]; + std::vector strides_in_tmp; + std::vector strides_out_tmp; + std::vector dim_tmp; + strides_in_tmp.resize(dim_size, 1); + strides_out_tmp.resize(dim_size, 1); + dim_tmp.resize(dim_size, 1); + for (int i = 1; i < dim_size; i++) { + strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[i - 1]; + strides_out_tmp[i] = strides_out_tmp[i - 1] * out_dims[i - 1]; } - for (int i = 1; i < dim_size - 1; ++i) { - strides_out[dim_size - i - 1] = std::accumulate( - out_dims.begin(), out_dims.begin() + i, 1, std::multiplies()) - strides_in[dim_size - i - 1] = - std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, - std::multiplies()) + for (int i = 0; i < dim_size; i++) { + dim_tmp[i] = in_dims[i]; } - memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t)); - memcpy(stride_out, strides_out.data(), kDims * sizeof(uint32_t)); - memcpy(shape_in, shapes_in.data(), kDims * sizeof(uint32_t)); + memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); + memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); + memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); + } + + __device__ inline int operator()(int index_output) const { + int index_src = 0; 
+#pragma unroll + for (int i = kDims - 1; i >= 0; --i) { + int tmp_index = (index_output / strides_out[i]); + index_output = index_output - tmp_index * strides_out[i]; + index_src += (tmp_index % in_dim[i]) * strides_in[i]; + } + return index_src; } }; +#pragma pack() } // namespace details @@ -99,12 +107,12 @@ struct BroadcastConfig { */ template -__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, + int size_nx, int size_ny, int stride_nx, + int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; - __local__ T in_temp[1]; + __local__ Tx in_temp[1]; // Each branch is added for better performance if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1 if (IsBoundary) { @@ -168,7 +176,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, * init_data: Initial value. */ template -__device__ __forceinline__ void Init(T* dst, T init_data) { +__device__ __inline__ void Init(T* dst, T init_data) { #pragma unroll for (int i = 0; i < NX; i++) { dst[i] = init_data; @@ -197,8 +205,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src, - int num) { +__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, + int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; if (IsBoundary) { // core_num() * NX > num @@ -241,10 +249,11 @@ __device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src, */ template -__device__ __forceinline__ void ReadDataBc( - T* dst, const T _global_ptr_* src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, int stride_nx, - int stride_ny) { +__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output, int stride_nx, + int stride_ny) { uint32_t thread_offset = block_offset + core_id(); uint32_t index_src = 0; __local__ T in_temp[1]; @@ -256,16 +265,11 @@ __device__ __forceinline__ void ReadDataBc( uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx; index_src = 0; if (IsBoundary) { - if (index_output >= total_num_output) { + if (index_output >= (uint32_t)total_num_output) { break; } } -#pragma unroll - for (int i = 0; i < Rank; ++i) { - uint32_t tmp = index_output / config.stride_out[i]; - index_output = index_output - tmp * config.stride_out[i]; - index_src += (tmp % config.shape_in[i]) * config.stride_in[i]; - } + index_src = config(index_output); GM2LM(src + index_src, in_temp, sizeof(T)); dst[nx + ny * NX] = in_temp[0]; } @@ -305,33 +309,34 @@ __device__ __forceinline__ void ReadDataBc( */ template -__device__ __forceinline__ void ReadDataReduce( - T* dst, const T _global_ptr_* src, int block_offset, - const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, - int stride_ny, bool reduce_last_dim) { - __local__ T in_temp[1]; +__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src, + int block_offset, + const IndexCal& index_cal, + int size_nx, int size_ny, + int stride_nx, int stride_ny, + bool reduce_last_dim) { + __local__ Tx in_temp[1]; int thread_offset = 0; - int left_size_nx = size_nx; - int left_size_ny = size_ny; + int left_idx = 0; if (reduce_last_dim) { - thread_offset = 
block_offset + core_id(); - left_size_nx -= thread_offset; + thread_offset = core_id(); + left_idx = 0; } else { - thread_offset = block_offset + core_id(); - left_size_ny -= thread_offset; + thread_offset = 0; + left_idx = 0; } if (NX == 1) { #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if (ny * stride_ny >= left_size_ny) { + if (thread_offset >= size_ny) { break; } } - uint32_t index_src = index_cal(thread_offset); - GM2LM(src + index_src, in_temp, sizeof(T)); - dst[ny] = in_temp[0]; + uint32_t index_src = index_cal(thread_offset + block_offset); + GM2LM(src + index_src, in_temp, sizeof(Tx)); + dst[ny] = static_cast(func(in_temp[0])); thread_offset += stride_ny; } } else { @@ -340,17 +345,16 @@ __device__ __forceinline__ void ReadDataReduce( #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if ((ny * stride_ny >= left_size_ny) || - (nx * stride_nx >= left_size_nx)) { + if ((thread_offset >= size_ny) || + (left_idx + nx * stride_nx >= size_nx)) { break; } } - uint32_t index_src = index_cal(thread_offset); - GM2LM(src + index_src, in_temp, sizeof(T)); - dst[nx + ny * NX] = in_temp[0]; + uint32_t index_src = index_cal(thread_offset + block_offset); + GM2LM(src + index_src, in_temp, sizeof(Tx)); + dst[nx + ny * NX] = static_cast(func(in_temp[0])); thread_offset += stride_ny; } - thread_offset += stride_nx; } } } @@ -421,9 +425,9 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { */ template -__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, + int size_nx, int size_ny, int stride_nx, + int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; __local__ Ty in_temp[1]; @@ -433,11 +437,11 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, if (IsBoundary) { if (left_size_nx > 0) { in_temp[0] = static_cast(src[0]); - LM2GM(in_temp, dst + thread_offset, sizeof(T)); + LM2GM(in_temp, dst + thread_offset, sizeof(Ty)); } } else { in_temp[0] = static_cast(src[0]); - LM2GM(in_temp, dst + thread_offset, sizeof(T)); + LM2GM(in_temp, dst + thread_offset, sizeof(Ty)); } } else if (NX == 1) { #pragma unroll @@ -449,7 +453,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } in_temp[0] = static_cast(src[idy]); - LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(T)); + LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(Ty)); } } else if (NY == 1) { // for NY == 1 and NX != 1 #pragma unroll @@ -461,7 +465,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } in_temp[0] = static_cast(src[idx]); - LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(T)); + LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(Ty)); } } else { // for NX != 1 and NY != 1 #pragma unroll @@ -480,7 +484,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } in_temp[0] = static_cast(src[idx + idy * NX]); LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny, - sizeof(T)); + sizeof(Ty)); } } } @@ -498,7 +502,7 @@ __device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, * init_data: The register pointer of init data, the size is NX. 
*/ template -__device__ __forceinline__ void Init(T* dst, T* init_data, int num) { +__device__ __inline__ void Init(T* dst, T* init_data, int num) { #pragma unroll for (int i = 0; i < NX; i++) { if (IsBoundary) { @@ -535,30 +539,26 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { */ template -__device__ __forceinline__ void ReadDataBc( - T* dst, const T _global_ptr_* src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output) { - uint32_t thread_offset = block_offset + core_id() * NX; - uint32_t index_src = 0; - __local__ T in_temp[1]; +__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output) { + int thread_offset = block_offset + core_id() * NX; + int index_src = 0; + __local__ T in_temp; #pragma unroll - for (uint32_t nx = 0; nx < NX; ++nx) { - uint32_t index_output = thread_offset + nx; + for (int nx = 0; nx < NX; ++nx) { + int index_output = thread_offset + nx; index_src = 0; if (IsBoundary) { if (index_output >= total_num_output) { break; } } -#pragma unroll - for (int i = 0; i < Rank; ++i) { - uint32_t tmp = index_output / config.stride_out[i]; - index_output = index_output - tmp * config.stride_out[i]; - index_src += (tmp % config.shape_in[i]) * config.stride_in[i]; - } - GM2LM(src + index_src, in_temp, sizeof(T)); - dst[nx + ny * NX] = in_temp[0]; + index_src = config(index_output); + GM2LM(src + index_src, &in_temp, sizeof(T)); + dst[nx] = in_temp; } } diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index e20e77ae26a71..558f8c81c6642 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -13,11 +13,18 @@ // limitations under the License. 
#pragma once -#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" #ifdef PADDLE_WITH_XPU2 #include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h" + +#define KPStream XPUStream +#define KPDevice paddle::platform::XPUDeviceContext +#define _ptr_ _global_ptr_ +#define __forceinline__ __inline__ +#define __restrict__ + #define THREAD_ID_X core_id() #define THREAD_ID_Y 0 #define THREAD_ID_Z 0 @@ -36,6 +43,12 @@ #else #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" + +#define KPStream gpuStream_t +#define KPDevice paddle::platform::CUDADeviceContext +#define _ptr_ + #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define THREAD_ID_Z threadIdx.z diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h index 1ffbbc217e254..65005a5adbb1d 100644 --- a/paddle/fluid/platform/hostdevice.h +++ b/paddle/fluid/platform/hostdevice.h @@ -17,7 +17,14 @@ #include #endif -#if (defined(__CUDACC__) || defined(__HIPCC__)) +#ifdef __xpu_kp__ +#include +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" +#include "xpu/kernel/math.h" +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f78328c01a30d..e4cc894e48354 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -86,7 +86,7 @@ struct ElementwisePrimitiveCaller { template struct ElementwiseWriteDataCaller { __device__ __forceinline__ void operator()( - paddle::framework::Array outs, + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, ConditionalT src[VecSize], int block_offset, int num) { @@ -109,7 +109,7 @@ struct ElementwiseWriteDataCaller { template struct ElementwiseWriteDataCaller { __device__ __forceinline__ void operator()( - paddle::framework::Array outs, + paddle::framework::Array<_ptr_ OutT *, 1> outs, OutT src[VecSize], int block_offset, int num) { @@ -126,8 +126,8 @@ template __device__ void VectorizedElementwiseKernelImpl( - const paddle::framework::Array &in, - paddle::framework::Array outs, + const paddle::framework::Array &in, + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, int num, int data_offset, Functor func) { @@ -161,8 +161,8 @@ template __global__ void VectorizedElementwiseKernel( - paddle::framework::Array ins, - paddle::framework::Array outs, + paddle::framework::Array ins, + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, int size, int main_offset, Functor func) { @@ -212,17 +212,13 @@ template -void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, +void ElementwiseCudaKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { auto numel = ins[0]->numel(); - int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize); - int grid_size = - ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; - auto stream = ctx.stream(); - paddle::framework::Array ins_data; - paddle::framework::Array outs_data; + 
paddle::framework::Array ins_data; + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < Arity; ++i) { ins_data[i] = ins[i]->data(); @@ -231,8 +227,9 @@ void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, outs_data[i] = (*outs)[i]->mutable_data(); } #ifdef PADDLE_WITH_XPU2 - block_size = 128; - grid_size = 8; + int block_size = 64; + int grid_size = 8; + auto stream = ctx.x_context()->xpu_stream; int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; VectorizedElementwiseKernel<<>>( ins_data, outs_data, numel, main_offset, func); #else + int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize); + int grid_size = + ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + auto stream = ctx.stream(); VectorizedElementwiseKernel void LaunchSameDimsElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &ctx, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { @@ -471,12 +472,12 @@ struct DimensionsTransform { template __device__ __forceinline__ void LoadData( T *dst, - const T *__restrict__ src, + const _ptr_ T *src, uint32_t block_offset, const kps::details::BroadcastConfig &config, int numel, int num, - bool need_broadcast) { + int need_broadcast) { // numel : whole num of output // num: how many data will be deal with in this time if (need_broadcast) { @@ -496,9 +497,9 @@ template __device__ void ElementwiseBroadcastKernelImpl( - const paddle::framework::Array &ins, - paddle::framework::Array outs, - const paddle::framework::Array &use_broadcast, + const paddle::framework::Array &ins, + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, + const paddle::framework::Array &use_broadcast, uint32_t numel, const paddle::framework::Array, Arity> &configs, @@ -540,9 +541,9 @@ template __global__ void ElementwiseBroadcastKernel( - paddle::framework::Array ins, - paddle::framework::Array outs, - paddle::framework::Array use_broadcast, + paddle::framework::Array ins, + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, + paddle::framework::Array use_broadcast, uint32_t numel, paddle::framework::Array, Arity> configs, @@ -570,7 +571,8 @@ __global__ void ElementwiseBroadcastKernel( block_offset, func); } - if (block_offset < numel) { + int num = numel - block_offset; + if (num > 0) { ElementwiseBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + ins, outs, use_broadcast, numel, configs, num, block_offset, func); } #else if (block_offset < main_offset) { @@ -619,23 +621,16 @@ template -void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, +void LaunchKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func, DimensionsTransform merge_dims) { int numel = (*outs)[0]->numel(); - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.stream(); - paddle::framework::Array, Arity> configs; - paddle::framework::Array use_broadcast; - paddle::framework::Array ins_data; - paddle::framework::Array outs_data; + paddle::framework::Array use_broadcast; + paddle::framework::Array ins_data; + paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < NumOuts; ++i) { outs_data[i] = (*outs)[i]->mutable_data(); @@ -643,7 +638,7 @@ void 
LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = ins[i]->data(); + ins_data[i] = (_ptr_ InT *)(ins[i]->data()); if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} @@ -654,10 +649,11 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, } #ifdef PADDLE_WITH_XPU2 - threads = 128; - blocks = 8; - main_offset = (numel / (VecSize * threads)) * VecSize * threads; - tail_tid = numel % (VecSize * threads); + const int threads = 64; + const int blocks = 8; + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.x_context()->xpu_stream; ElementwiseBroadcastKernel void LaunchBroadcastKernelForDifferentVecSize( - const paddle::platform::CUDADeviceContext &ctx, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, @@ -737,7 +738,7 @@ template void LaunchBroadcastElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &ctx, + const KPDevice &ctx, const std::vector &ins, std::vector *outs, int axis, @@ -835,12 +836,11 @@ template -void LaunchElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &cuda_ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { +void LaunchElementwiseCudaKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { std::vector dims_size; bool no_broadcast_flag = true; for (auto *in : ins) { @@ -849,14 +849,14 @@ void LaunchElementwiseCudaKernel( } if (no_broadcast_flag) { LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, outs, func); + ctx, ins, outs, func); } else { axis = axis == -1 ? 
*std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; LaunchBroadcastElementwiseCudaKernel( - cuda_ctx, ins, outs, axis, func); + ctx, ins, outs, axis, func); } } From 7915d18056d4f4284f5f415d5f9111c157b782c7 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 11 Jan 2022 20:00:12 +0800 Subject: [PATCH 091/151] Fix bug in elementwise_mul/div_grad when inplace strategy (#38840) * fix bug when inplace strategy * fix * fix * fix * fix * fix --- .../operators/elementwise/elementwise_div_op.cu | 10 ---------- .../operators/elementwise/elementwise_mul_op.cu | 12 +----------- .../operators/elementwise/elementwise_op_function.h | 1 + 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 7a25f65366901..06f9107db27b4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -31,20 +31,10 @@ ElementwiseDivGrad(const framework::ExecutionContext& ctx, const auto& dev_ctx = ctx.template device_context(); const auto place = ctx.GetPlace(); if (dx != nullptr && dy != nullptr) { - dx->mutable_data(place); - if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), place); - } std::vector ins = {dout, out, y}; GetGradXAndYOut( dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); } else if (dx != nullptr && dy == nullptr) { - dx->mutable_data(place); - if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), place); - } std::vector ins = {dout, y}; GetGradXOrYOut(dev_ctx, place, axis, ins, dout, dx, DivGradXFunctor()); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index a8b6c2abe3bf9..5ece5cadc603f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -74,20 +74,10 @@ ElementwiseMulGrad(const framework::ExecutionContext& ctx, const auto place = ctx.GetPlace(); if (dx != nullptr && dy != nullptr) { - dx->mutable_data(place); - if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), place); - } std::vector ins = {dout, y, x}; - GetGradXAndYOut( + GetGradXAndYOut( dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); } else if (dx != nullptr && dy == nullptr) { - dx->mutable_data(place); - if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), place); - } std::vector ins = {dout, y}; GetGradXOrYOut(dev_ctx, place, axis, ins, dout, dx, MulGradFunctor()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 3929699955a17..41cb2696f5492 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -2575,6 +2575,7 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, framework::Tensor *dy, Functor func) { framework::Tensor tmp_dx; framework::Tensor tmp_dy; + dx->mutable_data(place); dy->mutable_data(place); std::vector outs; if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { From 5b940c44fd5e755e08573bac6fe3af5ed8ef3c83 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 11 Jan 2022 
20:50:29 +0800 Subject: [PATCH 092/151] oepn third_party cache in wincheck_inference (#38877) --- paddle/scripts/paddle_build.bat | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index f64acbeb72307..ca34b12b5d4f8 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -261,7 +261,6 @@ set ON_INFER=ON set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON -set WITH_TPCACHE=OFF call :cmake || goto cmake_error call :build || goto build_error From be817719982f1821ab0519ceab85ec238bf99d43 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 11 Jan 2022 20:52:35 +0800 Subject: [PATCH 093/151] =?UTF-8?q?=E3=80=90PTen=E3=80=91Add=20dot=20and?= =?UTF-8?q?=20matmul=20grad=20kernel=20in=20pten=20(#38713)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor matmul directory in pten * fix merge conflict * add dot_grad kernel * add dot_grad kernel in pten * add matmul_grad kernel * update the code * delete useless code in fluid * fix some bug of running matmul grad kernel * fix merge conflict * refactor some code * refactor code --- cmake/pten_kernel.cmake | 3 + paddle/fluid/framework/operator.cc | 26 +- paddle/fluid/framework/pten_utils.cc | 2 +- paddle/fluid/imperative/prepared_operator.cc | 66 +- paddle/fluid/operators/conj_op.h | 2 +- paddle/fluid/operators/dot_op.cc | 7 + paddle/fluid/operators/dot_op.h | 222 +- paddle/fluid/operators/math/blas.h | 4 + paddle/fluid/operators/math/blas_impl.h | 16 +- paddle/fluid/operators/matmul_v2_op.cc | 24 + paddle/fluid/operators/matmul_v2_op.h | 2222 +---------------- paddle/pten/core/dense_tensor.cc | 6 + paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/kernel_alias_name.h | 5 + paddle/pten/core/kernel_context.cc | 11 +- paddle/pten/core/kernel_context.h | 10 + paddle/pten/core/kernel_registry.h | 4 + paddle/pten/core/kernel_utils.h | 22 + paddle/pten/include/linalg.h | 2 +- paddle/pten/include/math.h | 11 - paddle/pten/kernels/complex_kernel.h | 13 +- paddle/pten/kernels/cpu/complex_kernel.cc | 2 +- paddle/pten/kernels/cpu/dot_grad_kernel.cc | 32 + paddle/pten/kernels/cpu/dot_kernel.cc | 10 +- paddle/pten/kernels/cpu/matmul_grad_kernel.cc | 47 + paddle/pten/kernels/dot_grad_kernel.h | 56 + paddle/pten/kernels/dot_kernel.h | 8 +- paddle/pten/kernels/empty_kernel.cc | 79 +- paddle/pten/kernels/empty_kernel.h | 8 + paddle/pten/kernels/gpu/complex_kernel.cu | 3 +- paddle/pten/kernels/gpu/dot_grad_kernel.cu | 32 + paddle/pten/kernels/gpu/dot_kernel.cu | 10 +- paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 50 + paddle/pten/kernels/hybird/transpose.h | 28 + .../pten/kernels/impl/complex_kernel_impl.h | 6 +- .../pten/kernels/impl/dot_grad_kernel_impl.h | 919 +++++++ .../kernels/impl/matmul_grad_kernel_impl.h | 1742 +++++++++++++ paddle/pten/kernels/impl/matmul_kernel_impl.h | 14 +- paddle/pten/kernels/matmul_grad_kernel.h | 63 + paddle/pten/kernels/matmul_kernel.h | 14 +- 40 files changed, 3336 insertions(+), 2467 deletions(-) create mode 100644 paddle/pten/kernels/cpu/dot_grad_kernel.cc create mode 100644 paddle/pten/kernels/cpu/matmul_grad_kernel.cc create mode 100644 paddle/pten/kernels/dot_grad_kernel.h create mode 100644 paddle/pten/kernels/gpu/dot_grad_kernel.cu create mode 100644 paddle/pten/kernels/gpu/matmul_grad_kernel.cu create mode 100644 paddle/pten/kernels/impl/dot_grad_kernel_impl.h create mode 100644 paddle/pten/kernels/impl/matmul_grad_kernel_impl.h create mode 100644 
paddle/pten/kernels/matmul_grad_kernel.h diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake index 947defcea4a61..f962c1332093a 100644 --- a/cmake/pten_kernel.cmake +++ b/cmake/pten_kernel.cmake @@ -79,6 +79,9 @@ function(kernel_library TARGET) endif() list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) + list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h) + endif() list(APPEND all_srcs ${common_srcs}) list(APPEND all_srcs ${cpu_srcs}) list(APPEND all_srcs ${gpu_srcs}) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c3e54290fd3da..dc4d1365093aa 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1880,16 +1880,32 @@ void OperatorWithKernel::BuildPtenKernelContext( // Otherwise,we will create new storage. for (size_t offset = 0; offset < outs_vector.size(); ++offset) { if (current_vector_size > start_idx + offset) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[offset], out_def, + auto* buffer_tensor = pt_kernel_context_->MutableOutputAt(start_idx + - offset)); + offset); + if (buffer_tensor) { + experimental::ReMakePtenDenseTensorFromVar(outs_vector[offset], + out_def, buffer_tensor); + } } else { pt_kernel_context_->EmplaceBackOutputWithoutSetRange( experimental::MakePtenTensorBaseFromVar(outs_vector[offset], out_def)); } } + + // Deal with the case that some outputs are NULL when run the kernel. + // For example : the outputs of matmul_grad are dx and dy, + // sometimes dx or dy may be NULL. + if (outs_vector.empty()) { + if (current_vector_size > start_idx) { + pt_kernel_context_->SetOutputWithoutSetRange(start_idx, {nullptr}); + } else { + pt_kernel_context_->EmplaceBackOutputWithoutSetRange({nullptr}); + } + end_idx = start_idx + 1; + } + pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } @@ -2002,7 +2018,9 @@ void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const { range_pair.first, range_pair.second); for (size_t j = 0; j < pten_outs.size(); ++j) { - experimental::MakeVariableFromPtenTensor(pten_outs[j], outs_vector[j]); + if (pten_outs[j]) { + experimental::MakeVariableFromPtenTensor(pten_outs[j], outs_vector[j]); + } } } } diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 9831c2628dc95..dddcd914ed28a 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -99,7 +99,7 @@ KernelSignatureMap& KernelSignatureMap::Instance() { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && - op_proto != nullptr) { + op_proto) { KernelArgsNameMakerByOpProto maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; auto success = kernel_signature_map_->map_ diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c355ace528d42..1d12ecf30ede5 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -338,19 +338,41 @@ static void BuildDygraphPtenKernelContext( for (size_t i = 0; i < output_names.size(); ++i) { auto& out_def = output_defs.at(i); - auto& outs_vector = outs.at(output_names[i]); size_t start_idx = (i == 0 ? 
0 : kernel_ctx->OutputRangeAt(i - 1).second); - size_t end_idx = start_idx + outs_vector.size(); auto current_vector_size = kernel_ctx->OutputsSize(); + + auto iter = outs.find(output_names[i]); + if (iter == outs.end()) { + if (current_vector_size > start_idx) { + kernel_ctx->SetOutputWithoutSetRange(start_idx, {nullptr}); + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); + } + kernel_ctx->AssignOutputRange(std::make_pair(start_idx, start_idx + 1), + i); + continue; + } + + auto& outs_vector = iter->second; + size_t end_idx = start_idx + outs_vector.size(); + // If the memory needed is less than the current memory allocated, we will // reuse the current memory by using ReMakePtenDenseTensorFromVar. // Otherwise,we will create new storage. for (size_t offset = 0; offset < outs_vector.size(); ++offset) { if (current_vector_size > start_idx + offset) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[offset]->MutableVar(), out_def, - kernel_ctx->MutableOutputAt(start_idx + offset)); + auto* buffer_tensor = + kernel_ctx->MutableOutputAt(start_idx + offset); + if (buffer_tensor) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset]->MutableVar(), out_def, buffer_tensor); + } else { + kernel_ctx->SetOutputWithoutSetRange( + start_idx + offset, + experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); + } } else { kernel_ctx->EmplaceBackOutputWithoutSetRange( experimental::MakePtenTensorBaseFromVar( @@ -465,15 +487,18 @@ static void WriteBackToOutputs( auto& output_names = std::get<2>(pt_kernel_signature.args); for (size_t i = 0; i < output_names.size(); ++i) { - auto& outs_vector = outs.at(output_names[i]); + auto iter = outs.find(output_names[i]); + if (iter != outs.end()) { + auto& outs_vector = iter->second; - auto& range_pair = kernel_ctx->OutputRangeAt(i); - auto pten_outs = kernel_ctx->MutableOutputBetween( - range_pair.first, range_pair.second); + auto& range_pair = kernel_ctx->OutputRangeAt(i); + auto pten_outs = kernel_ctx->MutableOutputBetween( + range_pair.first, range_pair.second); - for (size_t j = 0; j < pten_outs.size(); ++j) { - experimental::MakeVariableFromPtenTensor(pten_outs[j], - outs_vector[j]->MutableVar()); + for (size_t j = 0; j < pten_outs.size(); ++j) { + experimental::MakeVariableFromPtenTensor(pten_outs[j], + outs_vector[j]->MutableVar()); + } } } } @@ -529,6 +554,7 @@ static void PreparedOpRunImpl( template static void PreparedOpRunPtImpl( const framework::OperatorBase& op, + const framework::OpKernelType& kernel_type, const framework::KernelSignature& pt_kernel_signature, const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context, platform::DeviceContext* dev_ctx, const NameVarMap& ins, @@ -558,7 +584,9 @@ static void PreparedOpRunPtImpl( pt_kernel_context->ClearData(); // TODO(chenweihang): add debug flags later - // TODO(chenweihang): deal with complex cases later + if (framework::IsComplexType(kernel_type.data_type_)) { + HandleComplexGradToRealGrad(outs); + } } void PreparedOp::Run(const NameVarMap& ins, @@ -566,9 +594,9 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { - PreparedOpRunPtImpl(op_, pt_kernel_signature_, pt_kernel_, - pt_kernel_context_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, + pt_kernel_, pt_kernel_context_, dev_ctx_, ins, + outs, attrs, default_attrs); } else { 
PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -580,9 +608,9 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { - PreparedOpRunPtImpl(op_, pt_kernel_signature_, pt_kernel_, - pt_kernel_context_, dev_ctx_, ins, - outs, attrs, default_attrs); + PreparedOpRunPtImpl( + op_, kernel_type_, pt_kernel_signature_, pt_kernel_, pt_kernel_context_, + dev_ctx_, ins, outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 1012e9383f607..381f4cb66b3cd 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -39,7 +39,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Conj(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index 31acd9718115c..e1463c8ccb58e 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -117,6 +117,13 @@ class DotGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")), ctx.GetPlace()); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext& ctx) const override { + return framework::KernelSignature( + "dot_grad", {"X", "Y", framework::GradVarName("Out")}, {}, + {framework::GradVarName("X"), framework::GradVarName("Y")}); + } }; template diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index f6877c57a5c18..02ba57ef8d495 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -22,217 +22,14 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/linalg.h" +#include "paddle/pten/kernels/dot_grad_kernel.h" +#include "paddle/pten/kernels/dot_kernel.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct P { - void operator()(T a, R b); -}; - -template -struct DotGradFunction { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - const Tensor* tensor_dout, Tensor* tensor_dx, - Tensor* tensor_dy, - const paddle::framework::ExecutionContext& ctx); -}; - -template -struct DotGradFunction> { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - const Tensor* tensor_dout, Tensor* tensor_dx, - Tensor* tensor_dy, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == tensor_dout->dims().size()) { - auto dout = framework::EigenVector::Flatten(*tensor_dout); - - if (tensor_dx) { - auto y = framework::EigenVector::Flatten(*tensor_y); - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - Eigen::DSizes size(tensor_dx->numel()); - - paddle::platform::ForRange for_range(dev_raw, - tensor_y->numel()); - math::ConjFunctor functor(tensor_y->data(), tensor_y->numel(), - tensor_dx->data()); - for_range(functor); - auto dx = framework::EigenVector::Flatten(*tensor_dx); - - dx.device(dev) = dx * dout.broadcast(size); - } - - if (tensor_dy) { - auto x = framework::EigenVector::Flatten(*tensor_x); - auto& dev_raw = 
ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - Eigen::DSizes size(tensor_dy->numel()); - - paddle::platform::ForRange for_range(dev_raw, - tensor_y->numel()); - math::ConjFunctor functor(tensor_x->data(), tensor_x->numel(), - tensor_dy->data()); - for_range(functor); - auto dy = framework::EigenVector::Flatten(*tensor_dy); - - dy.device(dev) = dy * dout.broadcast(size); - } - } else { - auto dout = framework::EigenMatrix::From(*tensor_dout); - - if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); - auto y = framework::EigenMatrix::From(*tensor_y); - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - Eigen::DSizes size(1, tensor_dx->dims()[1]); - - paddle::platform::ForRange for_range(dev_raw, - tensor_y->numel()); - math::ConjFunctor functor(tensor_y->data(), tensor_y->numel(), - tensor_dx->data()); - for_range(functor); - auto dx = framework::EigenMatrix::From(*tensor_dx); - - dx.device(dev) = dx * dout.broadcast(size); - } - - if (tensor_dy) { - tensor_dy->mutable_data(ctx.GetPlace()); - auto x = framework::EigenMatrix::From(*tensor_x); - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - Eigen::DSizes size(1, tensor_dy->dims()[1]); - - paddle::platform::ForRange for_range(dev_raw, - tensor_x->numel()); - math::ConjFunctor functor(tensor_x->data(), tensor_x->numel(), - tensor_dy->data()); - for_range(functor); - - auto dy = framework::EigenMatrix::From(*tensor_dy); - - dy.device(dev) = dy * dout.broadcast(size); - } - } -#else - const auto* data_dout = tensor_dout->data(); - - if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = T(data_y[i].real, -data_y[i].imag) * data_dout[s]; - } - } - - if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = T(data_x[i].real, -data_x[i].imag) * data_dout[s]; - } - } -#endif - } -}; - -template -struct DotGradFunction> { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - const Tensor* tensor_dout, Tensor* tensor_dx, - Tensor* tensor_dy, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == tensor_dout->dims().size()) { - auto dout = framework::EigenVector::Flatten(*tensor_dout); - - if (tensor_dx) { - auto y = framework::EigenVector::Flatten(*tensor_y); - auto dx = framework::EigenVector::Flatten(*tensor_dx); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(tensor_dx->numel()); - dx.device(dev) = y * dout.broadcast(size); - } - - if (tensor_dy) { - auto x = framework::EigenVector::Flatten(*tensor_x); - auto dy = framework::EigenVector::Flatten(*tensor_dy); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(tensor_dy->numel()); - dy.device(dev) = x * dout.broadcast(size); - } - } else { - auto dout = framework::EigenMatrix::From(*tensor_dout); - - if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); - auto y = 
framework::EigenMatrix::From(*tensor_y); - auto dx = framework::EigenMatrix::From(*tensor_dx); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(1, tensor_dx->dims()[1]); - dx.device(dev) = y * dout.broadcast(size); - } - - if (tensor_dy) { - tensor_dy->mutable_data(ctx.GetPlace()); - auto x = framework::EigenMatrix::From(*tensor_x); - auto dy = framework::EigenMatrix::From(*tensor_dy); - auto& dev = - *ctx.template device_context().eigen_device(); - Eigen::DSizes size(1, tensor_dy->dims()[1]); - dy.device(dev) = x * dout.broadcast(size); - } - } -#else - auto const *x = tensor_x->data(), *y = tensor_y->data(), - *dz = tensor_dout->data(); - auto&& d = tensor_x->dims(); - auto const N = tensor_x->numel(); - auto const B = d[d.size() - 1]; - - if (tensor_dx) { - auto* dx = tensor_dx->mutable_data(ctx.GetPlace()); - for (auto j = 0; j < N / B; ++j) { - auto const ss = dz[j]; - for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; - } - } - - if (tensor_dy) { - auto* dy = tensor_dy->mutable_data(ctx.GetPlace()); - for (auto j = 0; j < N / B; ++j) { - auto const ss = dz[j]; - for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; - } - } -#endif - } -}; - // See Note [ Why still keep the original kernel implementation? ] template class DotKernel : public framework::OpKernel { @@ -249,7 +46,7 @@ class DotKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Dot(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get()); + pten::DotKernel(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get()); } }; @@ -266,8 +63,17 @@ class DotGradKernel : public framework::OpKernel { if (tensor_dx) tensor_dx->mutable_data(ctx.GetPlace()); if (tensor_dy) tensor_dy->mutable_data(ctx.GetPlace()); - DotGradFunction()(tensor_x, tensor_y, tensor_dout, - tensor_dx, tensor_dy, ctx); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*tensor_x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*tensor_y); + auto pt_dout = paddle::experimental::MakePtenDenseTensor(*tensor_dout); + auto pt_dx = paddle::experimental::MakePtenDenseTensor(*tensor_dx); + auto pt_dy = paddle::experimental::MakePtenDenseTensor(*tensor_dy); + + auto& dev_ctx = ctx.device_context(); + + // call new kernel + pten::DotGradKernel(dev_ctx, *pt_x, *pt_y, *pt_dout, pt_dx.get(), + pt_dy.get()); } }; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index f245bad01aa4c..2be7695e6a8c4 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -225,6 +225,10 @@ class Blas { const framework::Tensor& mat_b, const MatDescriptor& dim_b, T alpha, framework::Tensor* mat_out, T beta) const; + template + void MatMul(const T* mat_a, const MatDescriptor& dim_a, const T* mat_b, + const MatDescriptor& dim_b, T alpha, T* mat_out, T beta) const; + template void VINV(int n, const T* a, T* y) const; diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 4bcf3baa64932..be9cf1e3448b6 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -1249,6 +1249,15 @@ void Blas::MatMul(const framework::Tensor &mat_a, const framework::Tensor &mat_b, const MatDescriptor &dim_b, T alpha, framework::Tensor *mat_out, T beta) const { + MatMul(mat_a.data(), dim_a, mat_b.data(), dim_b, alpha, + mat_out->data(), beta); +} + +template +template +void Blas::MatMul(const T *mat_a, const MatDescriptor &dim_a, + const T *mat_b, const MatDescriptor 
&dim_b, + T alpha, T *mat_out, T beta) const { PADDLE_ENFORCE_EQ( dim_a.width_, dim_b.height_, platform::errors::InvalidArgument( @@ -1261,8 +1270,7 @@ void Blas::MatMul(const framework::Tensor &mat_a, CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { this->template GEMM(transA, transB, dim_a.height_, dim_b.width_, - dim_a.width_, alpha, mat_a.data(), - mat_b.data(), beta, mat_out->data()); + dim_a.width_, alpha, mat_a, mat_b, beta, mat_out); } else { PADDLE_ENFORCE_EQ( dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || @@ -1273,8 +1281,8 @@ void Blas::MatMul(const framework::Tensor &mat_a, "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", dim_a.batch_size_, dim_b.batch_size_)); this->template BatchedGEMM( - transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha, - mat_a.data(), mat_b.data(), beta, mat_out->data(), + transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha, mat_a, + mat_b, beta, mat_out, dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, dim_a.stride_, dim_b.stride_); } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 5add86f5b3c74..a5eca7b225558 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -389,6 +389,14 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { tensor.place(), tensor.layout()); } } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext& ctx) const override { + return framework::KernelSignature( + "matmul_grad", {"X", "Y", framework::GradVarName("Out")}, + {"trans_x", "trans_y"}, + {framework::GradVarName("X"), framework::GradVarName("Y")}); + } }; template @@ -431,6 +439,13 @@ class MatMulV2OpDoubleGrad : public framework::OperatorWithKernel { context->ShareDim("DOut", "DDOut"); } } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext& ctx) const override { + return framework::KernelSignature( + "matmul_double_grad", {"X", "Y", "DOut", "DDX", "DDY"}, + {"trans_x", "trans_y"}, {"DX", "DY", "DDOut"}); + } }; template @@ -500,6 +515,15 @@ class MatMulV2OpTripleGrad : public framework::OperatorWithKernel { context->ShareDim("Y", "D_DDY_out"); } } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext& ctx) const override { + return framework::KernelSignature( + "matmul_triple_grad", + {"X", "Y", "DOut", "DDX", "DDY", "D_DX", "D_DY", "D_DDOut"}, + {"trans_x", "trans_y"}, + {"D_X_out", "D_Y_out", "D_DOut_out", "D_DDX_out", "D_DDY_out"}); + } }; template diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index b257f345eaf36..e93bd212868fd 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -28,6 +28,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" +#include "paddle/pten/kernels/matmul_grad_kernel.h" #include "paddle/pten/kernels/matmul_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) @@ -39,333 +40,6 @@ namespace operators { using framework::Tensor; -template -void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output, - const std::vector& reduce_dims, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = ctx.cuda_device_context().stream(); - TensorReduceFunctorImpl>( - *input, output, kps::IdentityFunctor(), reduce_dims, stream); -#else - ReduceKernelFunctor( - input, output, reduce_dims, true, false, ctx) - .template apply(); -#endif -} - -static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims, - const int y_ndim, const std::int64_t* y_dims, - std::int64_t* x_bd_dims, - std::int64_t* y_bd_dims, - std::int64_t* out_bd_dims) { - const int ndim = (std::max)(x_ndim, y_ndim); - std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); - std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); - std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); - std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); - - for (int i = 0; i < ndim; ++i) { - PADDLE_ENFORCE_EQ( - x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, - true, - platform::errors::InvalidArgument( - "Input(X) and Input(Y) has error dim." - "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s]," - "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1," - "But received X_broadcast's shape[%s] = [%s]" - "received Y_broadcast's shape[%s] = [%s]", - i, i, i, i, i, x_bd_dims[i], i, y_bd_dims[i])); - if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { - out_bd_dims[i] = 0; - } else { - out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); - } - } -} - -static int64_t GetIndexMessage(const int n, const int64_t* dims, - const int64_t* index) { - int64_t sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -static void IndexIncreaseFromDims(const int ndim, const int64_t* dims, - int64_t* index) { - for (int i = ndim - 1; i >= 0; --i) { - ++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -template -void MatMulFunction(const Tensor* X, const Tensor* Y, - const std::vector& x_dims, - const std::vector& y_dims, Tensor* Out, - bool trans_x, bool trans_y, - const paddle::framework::ExecutionContext& ctx, - bool flag = false) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - - // Get data ptr - const T* x_data = X->data(); - const T* y_data = Y->data(); - - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. 
But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), Y->numel())); - VLOG(3) << "MatMul's case 1"; - Out->Resize({1}); - Out->mutable_data(ctx.GetPlace()); - auto out_eigen = framework::EigenScalar::From(*Out); - auto x_eigen = framework::EigenVector::Flatten(*X); - auto y_eigen = framework::EigenVector::Flatten(*Y); - - auto& dev = *ctx.template device_context().eigen_device(); - if (flag) { - out_eigen.device(dev) = (x_eigen * y_eigen).sum() + out_eigen; - } else { - out_eigen.device(dev) = (x_eigen * y_eigen).sum(); - } - return; - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); - - if (x_ndim == 1) { - const int N = X->numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], N, - platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, N, y_ndim - 1, y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], N, - platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, N, y_ndim - 2, y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->Resize(framework::make_ddim(out_dims)); - Out->mutable_data(ctx.GetPlace()); - if (trans_y) { - const int M = Y->numel() / N; - VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, M, N, static_cast(1), y_data, x_data, - static_cast(flag), Out->data()); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y->numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, N, M, static_cast(1), y_data, x_data, - static_cast(flag), Out->data()); - } else { - VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), - y_data, x_data, static_cast(flag), Out->data(), - batch_size, M * N, 0); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y->numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ(x_dims[x_ndim - 2], N, - platform::errors::InvalidArgument( - "Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, N, x_ndim - 2, x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ(x_dims[x_ndim - 1], N, - platform::errors::InvalidArgument( - "Input(X) has error dim." 
- "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, N, x_ndim - 1, x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->Resize(framework::make_ddim(out_dims)); - Out->mutable_data(ctx.GetPlace()); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X->numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, N, M, static_cast(1), x_data, y_data, - static_cast(flag), Out->data()); - } else { - VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), - x_data, y_data, static_cast(flag), Out->data(), - batch_size, M * N, 0); - } - } else { - const int M = X->numel() / N; - VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, M, N, static_cast(1), x_data, y_data, - static_cast(flag), Out->data()); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, - platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, - platform::errors::InvalidArgument( - "Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, x_dims.data(), y_ndim - 2, y_dims.data(), - x_broadcast_dims.data(), y_broadcast_dims.data(), - out_broadcast_dims.data()); - - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->Resize(framework::make_ddim(out_broadcast_dims)); - Out->mutable_data(ctx.GetPlace()); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = !std::equal( - x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = std::accumulate( - x_broadcast_dims.cbegin(), x_broadcast_dims.cbegin() + batch_dim, 1LL, - std::multiplies()); - const std::int64_t y_batch_size = std::accumulate( - y_broadcast_dims.cbegin(), y_broadcast_dims.cbegin() + batch_dim, 1LL, - std::multiplies()); - const std::int64_t out_batch_size = std::accumulate( - out_broadcast_dims.cbegin(), out_broadcast_dims.cbegin() + batch_dim, 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul's case 8"; - blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast(1), - x_data, y_data, static_cast(flag), Out->data()); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, y_batch_size * N, K, static_cast(1), y_data, x_data, - static_cast(flag), Out->data()); - } else { - VLOG(3) << "MatMul's case 10"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? 
CblasTrans : CblasNoTrans, M, N, K, - static_cast(1), x_data, y_data, static_cast(flag), - Out->data(), out_batch_size, 0, K * N); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul's case 11"; - blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, N, K, static_cast(1), x_data, y_data, - static_cast(flag), Out->data()); - } else { - VLOG(3) << "MatMul's case 12"; - blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, - static_cast(1), x_data, y_data, static_cast(flag), - Out->data(), out_batch_size, M * K, 0); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul's case 13"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, - static_cast(1), x_data, y_data, static_cast(flag), - Out->data(), out_batch_size, M * K, K * N); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = Out->data() + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul's case 14"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, - static_cast(1), x_ptr.data(), y_ptr.data(), - static_cast(flag), out_ptr.data(), out_batch_size); - } -} - -template -void MatMulFunction(const Tensor* X, const Tensor* Y, Tensor* Out, bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext& ctx, - bool flag = false) { - const std::vector x_dims = vectorize(X->dims()); - const std::vector y_dims = vectorize(Y->dims()); - MatMulFunction(X, Y, x_dims, y_dims, Out, trans_x, trans_y, - ctx, flag); -} - template class MatMulV2Kernel : public framework::OpKernel { public: @@ -400,26 +74,6 @@ static framework::Tensor FoldInitDims(const framework::Tensor& input) { return output; } -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static framework::Tensor FoldHeadAndLastDims(const DeviceContext& context, - const framework::Tensor& input) { - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - framework::Tensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector axis = {1, 0, 2}; - math::Transpose trans; - trans(context, input, &output, axis); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - return output; -} - /** * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the * original x_dim is returned. 
@@ -482,1000 +136,45 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, ReshapeTensorIntoMatrixSequence(y, mat_dim_y); } -template -struct ConjHelper { - explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {} - HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) { - dst.Resize(src.dims()); - dst.set_layout(src.layout()); - dst.ShareDataWith(src); - return; - } - - const framework::ExecutionContext& ctx_; -}; - -template -struct ConjHelper> { - explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {} - - HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) { - dst.Resize(src.dims()); - auto* src_data = src.data>(); - auto* dst_data = dst.mutable_data>( - ctx_.GetPlace(), - size_t(src.numel() * sizeof(paddle::platform::complex))); - - platform::ForRange for_range( - ctx_.template device_context(), src.numel()); - math::ConjFunctor> functor( - src_data, src.numel(), dst_data); - for_range(functor); - return; - } - const framework::ExecutionContext& ctx_; -}; - -template -struct ConjHelper> { - explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {} - - HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) { - dst.Resize(src.dims()); - auto* src_data = src.data>(); - auto* dst_data = dst.mutable_data>( - ctx_.GetPlace(), - size_t(src.numel() * sizeof(paddle::platform::complex))); - - platform::ForRange for_range( - ctx_.template device_context(), src.numel()); - math::ConjFunctor> functor( - src_data, src.numel(), dst_data); - for_range(functor); - return; - } - const framework::ExecutionContext& ctx_; -}; - -template -struct DotDoubleGradFunction { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - Tensor* tensor_dx, Tensor* tensor_dy, - const Tensor* tensor_dout, const Tensor* tensor_ddx, - const Tensor* tensor_ddy, Tensor* tensor_ddout, - const paddle::framework::ExecutionContext& ctx); -}; - -template -struct DotDoubleGradFunction> { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - Tensor* tensor_dx, Tensor* tensor_dy, - const Tensor* tensor_dout, const Tensor* tensor_ddx, - const Tensor* tensor_ddy, Tensor* tensor_ddout, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == tensor_dout->dims().size()) { - framework::Tensor tensor_dout_help; - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - if (tensor_dx || tensor_dy) { - tensor_dout_help.Resize(tensor_dout->dims()); - tensor_dout_help.mutable_data(ctx.GetPlace()); - paddle::platform::ForRange for_range( - dev_raw, tensor_dout->numel()); - math::ConjFunctor functor(tensor_dout->data(), - tensor_dout->numel(), - tensor_dout_help.data()); - for_range(functor); - } - if (tensor_dx) { - auto ddy = framework::EigenVector::Flatten(*tensor_ddy); - Eigen::DSizes size(tensor_ddy->numel()); - auto dx = framework::EigenVector::Flatten(*tensor_dx); - auto dout = framework::EigenVector::Flatten(tensor_dout_help); - dx.device(dev) = ddy * dout.broadcast(size); - } - - if (tensor_dy) { - auto ddx = framework::EigenVector::Flatten(*tensor_ddx); - Eigen::DSizes size(tensor_ddx->numel()); - auto dy = framework::EigenVector::Flatten(*tensor_dy); - auto dout = framework::EigenVector::Flatten(tensor_dout_help); - dy.device(dev) = ddx * dout.broadcast(size); - } - - if (tensor_ddout) { - framework::Tensor tensor_x_help, tensor_y_help; - tensor_x_help.Resize(tensor_x->dims()); - 
tensor_x_help.mutable_data(ctx.GetPlace()); - tensor_y_help.Resize(tensor_y->dims()); - tensor_y_help.mutable_data(ctx.GetPlace()); - - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - paddle::platform::ForRange for_range(dev_raw, - tensor_x->numel()); - math::ConjFunctor functor_x(tensor_x->data(), tensor_x->numel(), - tensor_x_help.data()); - for_range(functor_x); - math::ConjFunctor functor_y(tensor_y->data(), tensor_y->numel(), - tensor_y_help.data()); - for_range(functor_y); - auto x = framework::EigenVector::Flatten(tensor_x_help); - auto y = framework::EigenVector::Flatten(tensor_y_help); - auto ddx = framework::EigenVector::Flatten(*tensor_ddx); - auto ddy = framework::EigenVector::Flatten(*tensor_ddy); - auto ddout = framework::EigenVector::Flatten(*tensor_ddout); - ddout.device(dev) = (x * ddy + y * ddx).sum(); - } - } -#else - const auto* data_dout = tensor_dout->data(); - - if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_ddy = tensor_ddy->data(); - const framework::DDim& dim = tensor_dx->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddy[i]; - } - } - - if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_ddx = tensor_ddx->data(); - const framework::DDim& dim = tensor_dy->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddx[i]; - } - } - - if (tensor_ddout) { - auto* data_ddout = tensor_ddout->mutable_data(ctx.GetPlace()); - auto* data_x = tensor_x->data(); - auto* data_y = tensor_y->data(); - auto* data_ddx = tensor_ddx->data(); - auto* data_ddy = tensor_ddy->data(); - - const framework::DDim& dim = tensor_dy->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; - } - if (new_s) { - data_ddout[s] = T(data_x[i].real, -data_x[i].imag) * data_ddy[i] + - T(data_y[i].real, -data_y[i].imag) * data_ddx[i]; - } else { - data_ddout[s] += T(data_x[i].real, -data_x[i].imag) * data_ddy[i] + - T(data_y[i].real, -data_y[i].imag) * data_ddx[i]; - } - new_s = false; - } - } -#endif - } -}; - -template -struct DotDoubleGradFunction> { - void operator()(const Tensor* tensor_x, const Tensor* tensor_y, - Tensor* tensor_dx, Tensor* tensor_dy, - const Tensor* tensor_dout, const Tensor* tensor_ddx, - const Tensor* tensor_ddy, Tensor* tensor_ddout, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == tensor_dout->dims().size()) { - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - auto dout = framework::EigenVector::Flatten(*tensor_dout); - if (tensor_dx) { - tensor_dx->mutable_data(ctx.GetPlace()); - auto ddy = framework::EigenVector::Flatten(*tensor_ddy); - Eigen::DSizes size(tensor_ddy->numel()); - auto dx = framework::EigenVector::Flatten(*tensor_dx); - dx.device(dev) = ddy * dout.broadcast(size); - } - - if (tensor_dy) { - tensor_dy->mutable_data(ctx.GetPlace()); - auto ddx = framework::EigenVector::Flatten(*tensor_ddx); - Eigen::DSizes 
size(tensor_ddx->numel()); - - auto dy = framework::EigenVector::Flatten(*tensor_dy); - dy.device(dev) = ddx * dout.broadcast(size); - } - - if (tensor_ddout) { - tensor_ddout->mutable_data(ctx.GetPlace()); - auto x = framework::EigenVector::Flatten(*tensor_x); - auto y = framework::EigenVector::Flatten(*tensor_y); - auto ddx = framework::EigenVector::Flatten(*tensor_ddx); - auto ddy = framework::EigenVector::Flatten(*tensor_ddy); - auto ddout = framework::EigenVector::Flatten(*tensor_ddout); - ddout.device(dev) = (x * ddy + y * ddx).sum(); - } - } -#else - const auto* data_dout = tensor_dout->data(); - - if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_ddy = tensor_ddy->data(); - const framework::DDim& dim = tensor_dx->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_dout[s] * data_ddy[i]; - } - } - - if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_ddx = tensor_ddx->data(); - const framework::DDim& dim = tensor_dy->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_dout[s] * data_ddx[i]; - } - } - - if (tensor_ddout) { - auto* data_ddout = tensor_ddout->mutable_data(ctx.GetPlace()); - auto* data_x = tensor_x->data(); - auto* data_y = tensor_y->data(); - auto* data_ddx = tensor_ddx->data(); - auto* data_ddy = tensor_ddy->data(); - - const framework::DDim& dim = tensor_dy->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; - } - if (new_s) { - data_ddout[s] = data_x[i] * data_ddy[i] + data_y[i] * data_ddx[i]; - } else { - data_ddout[s] += data_x[i] * data_ddy[i] + data_y[i] * data_ddx[i]; - } - new_s = false; - } - } -#endif - } -}; - -template -struct DotTripleGradFunction { - void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y, - const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy, - const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy, - const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout, - Tensor* out_tensor_d_x, Tensor* out_tensor_d_y, - Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx, - Tensor* out_tensor_d_ddy, - const paddle::framework::ExecutionContext& ctx); -}; - -// TODO(wuweilong): enable this function when the unittests framewark for multi -// grad is ok (dtype: complex64 or complex128). 
-template -struct DotTripleGradFunction> { - void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y, - const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy, - const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy, - const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout, - Tensor* out_tensor_d_x, Tensor* out_tensor_d_y, - Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx, - Tensor* out_tensor_d_ddy, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == in_tensor_d_ddout->dims().size()) { - framework::Tensor in_tensor_d_ddout_help; - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - if (out_tensor_d_x || out_tensor_d_y) { - in_tensor_d_ddout_help.Resize(in_tensor_d_ddout->dims()); - in_tensor_d_ddout_help.mutable_data(ctx.GetPlace()); - paddle::platform::ForRange for_range( - dev_raw, in_tensor_d_ddout->numel()); - math::ConjFunctor functor(in_tensor_d_ddout->data(), - in_tensor_d_ddout->numel(), - in_tensor_d_ddout_help.data()); - for_range(functor); - } - if (out_tensor_d_x) { - auto ddy = framework::EigenVector::Flatten(*in_tensor_ddy); - Eigen::DSizes size(in_tensor_ddy->numel()); - auto d_x = framework::EigenVector::Flatten(*out_tensor_d_x); - auto d_ddout = - framework::EigenVector::Flatten(in_tensor_d_ddout_help); - d_x.device(dev) = ddy * d_ddout.broadcast(size); - } - - if (out_tensor_d_y) { - auto ddx = framework::EigenVector::Flatten(*in_tensor_ddx); - Eigen::DSizes size(in_tensor_ddx->numel()); - auto d_y = framework::EigenVector::Flatten(*out_tensor_d_y); - auto d_ddout = - framework::EigenVector::Flatten(in_tensor_d_ddout_help); - d_y.device(dev) = ddx * d_ddout.broadcast(size); - } - - if (out_tensor_d_dout) { - framework::Tensor in_tensor_ddx_help, in_tensor_ddy_help; - in_tensor_ddx_help.Resize(in_tensor_ddx->dims()); - in_tensor_ddx_help.mutable_data(ctx.GetPlace()); - in_tensor_ddy_help.Resize(in_tensor_ddy->dims()); - in_tensor_ddy_help.mutable_data(ctx.GetPlace()); - - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - paddle::platform::ForRange for_range( - dev_raw, in_tensor_ddx->numel()); - math::ConjFunctor functor_ddx(in_tensor_ddx->data(), - in_tensor_ddx->numel(), - in_tensor_ddx_help.data()); - for_range(functor_ddx); - math::ConjFunctor functor_ddy(in_tensor_ddy->data(), - in_tensor_ddy->numel(), - in_tensor_ddy_help.data()); - for_range(functor_ddy); - auto ddx = framework::EigenVector::Flatten(in_tensor_ddx_help); - auto ddy = framework::EigenVector::Flatten(in_tensor_ddy_help); - auto d_dx = framework::EigenVector::Flatten(*in_tensor_d_dx); - auto d_dy = framework::EigenVector::Flatten(*in_tensor_d_dy); - auto d_dout = framework::EigenVector::Flatten(*out_tensor_d_dout); - d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum(); - } - if (out_tensor_d_ddx) { - framework::Tensor in_tensor_dout_help, in_tensor_y_help; - in_tensor_dout_help.Resize(in_tensor_dout->dims()); - in_tensor_dout_help.mutable_data(ctx.GetPlace()); - in_tensor_y_help.Resize(in_tensor_y->dims()); - in_tensor_y_help.mutable_data(ctx.GetPlace()); - - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - paddle::platform::ForRange for_range( - dev_raw, in_tensor_dout->numel()); - math::ConjFunctor functor_dout(in_tensor_dout->data(), - in_tensor_dout->numel(), - in_tensor_dout_help.data()); - for_range(functor_dout); - math::ConjFunctor functor_y(in_tensor_y->data(), - in_tensor_y->numel(), - 
in_tensor_y_help.data()); - for_range(functor_y); - auto dout = framework::EigenVector::Flatten(in_tensor_dout_help); - auto y = framework::EigenVector::Flatten(in_tensor_y_help); - auto d_ddout = framework::EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dy = framework::EigenVector::Flatten(*in_tensor_d_dy); - auto d_ddx = framework::EigenVector::Flatten(*out_tensor_d_ddx); - Eigen::DSizes size(in_tensor_y->numel()); - d_ddx.device(dev) = - (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size)); - } - if (out_tensor_d_ddy) { - framework::Tensor in_tensor_dout_help, in_tensor_x_help; - in_tensor_dout_help.Resize(in_tensor_dout->dims()); - in_tensor_dout_help.mutable_data(ctx.GetPlace()); - in_tensor_x_help.Resize(in_tensor_x->dims()); - in_tensor_x_help.mutable_data(ctx.GetPlace()); - - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - paddle::platform::ForRange for_range( - dev_raw, in_tensor_dout->numel()); - math::ConjFunctor functor_dout(in_tensor_dout->data(), - in_tensor_dout->numel(), - in_tensor_dout_help.data()); - for_range(functor_dout); - math::ConjFunctor functor_x(in_tensor_x->data(), - in_tensor_x->numel(), - in_tensor_x_help.data()); - for_range(functor_x); - auto dout = framework::EigenVector::Flatten(in_tensor_dout_help); - auto x = framework::EigenVector::Flatten(in_tensor_x_help); - auto d_ddout = framework::EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dx = framework::EigenVector::Flatten(*in_tensor_d_dx); - auto d_ddy = framework::EigenVector::Flatten(*out_tensor_d_ddy); - Eigen::DSizes size(in_tensor_x->numel()); - d_ddy.device(dev) = - (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); - } - } -#else - const auto* data_d_ddout = in_tensor_d_ddout->data(); - - if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); - const auto* data_ddy = in_tensor_ddy->data(); - - const framework::DDim& dim = out_tensor_d_x->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_x[i] = T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s]; - } - } - - if (out_tensor_d_y) { - auto* data_d_y = out_tensor_d_y->mutable_data(ctx.GetPlace()); - const auto* data_ddx = in_tensor_ddx->data(); - - const framework::DDim& dim = out_tensor_d_y->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_y[i] = T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s]; - } - } - - if (out_tensor_d_dout) { - auto* data_d_dout = out_tensor_d_dout->mutable_data(ctx.GetPlace()); - auto* data_ddx = in_tensor_ddx->data(); - auto* data_ddy = in_tensor_ddy->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - - const framework::DDim& dim = out_tensor_d_dout->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; - } - if (new_s) { - data_d_dout[s] = - T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + - T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; - } else { - data_d_dout[s] += - T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + - T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; - } - new_s = false; - } - } - - if (out_tensor_d_ddx) { - auto* 
data_d_ddx = out_tensor_d_ddx->mutable_data(ctx.GetPlace()); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - auto* data_y = in_tensor_y->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const framework::DDim& dim = out_tensor_d_ddx->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddx[i] = - T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i] + - T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s]; - } - } - - if (out_tensor_d_ddy) { - auto* data_d_ddy = out_tensor_d_ddy->mutable_data(ctx.GetPlace()); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_x = in_tensor_x->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const framework::DDim& dim = out_tensor_d_ddy->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddy[i] = - T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i] + - T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s]; - } - } -#endif - } -}; - -template -struct DotTripleGradFunction> { - void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y, - const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy, - const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy, - const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout, - Tensor* out_tensor_d_x, Tensor* out_tensor_d_y, - Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx, - Tensor* out_tensor_d_ddy, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - if (1 == in_tensor_d_ddout->dims().size()) { - auto& dev_raw = ctx.template device_context(); - auto& dev = *dev_raw.eigen_device(); - auto d_ddout = framework::EigenVector::Flatten(*in_tensor_d_ddout); - if (out_tensor_d_x) { - out_tensor_d_x->mutable_data(ctx.GetPlace()); - auto ddy = framework::EigenVector::Flatten(*in_tensor_ddy); - Eigen::DSizes size(in_tensor_ddy->numel()); - auto d_x = framework::EigenVector::Flatten(*out_tensor_d_x); - d_x.device(dev) = ddy * d_ddout.broadcast(size); - } - - if (out_tensor_d_y) { - out_tensor_d_y->mutable_data(ctx.GetPlace()); - auto ddx = framework::EigenVector::Flatten(*in_tensor_ddx); - Eigen::DSizes size(in_tensor_ddx->numel()); - - auto d_y = framework::EigenVector::Flatten(*out_tensor_d_y); - d_y.device(dev) = ddx * d_ddout.broadcast(size); - } - - if (out_tensor_d_dout) { - out_tensor_d_dout->mutable_data(ctx.GetPlace()); - auto ddx = framework::EigenVector::Flatten(*in_tensor_ddx); - auto ddy = framework::EigenVector::Flatten(*in_tensor_ddy); - auto d_dx = framework::EigenVector::Flatten(*in_tensor_d_dx); - auto d_dy = framework::EigenVector::Flatten(*in_tensor_d_dy); - auto d_dout = framework::EigenVector::Flatten(*out_tensor_d_dout); - d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum(); - } - - if (out_tensor_d_ddx) { - out_tensor_d_ddx->mutable_data(ctx.GetPlace()); - auto dout = framework::EigenVector::Flatten(*in_tensor_dout); - auto y = framework::EigenVector::Flatten(*in_tensor_y); - auto d_ddout = framework::EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dy = framework::EigenVector::Flatten(*in_tensor_d_dy); - auto d_ddx = framework::EigenVector::Flatten(*out_tensor_d_ddx); - Eigen::DSizes size(in_tensor_y->numel()); - d_ddx.device(dev) = - (dout.broadcast(size) * d_dy + 
y * d_ddout.broadcast(size)); - } - - if (out_tensor_d_ddy) { - out_tensor_d_ddy->mutable_data(ctx.GetPlace()); - auto dout = framework::EigenVector::Flatten(*in_tensor_dout); - auto x = framework::EigenVector::Flatten(*in_tensor_x); - auto d_ddout = framework::EigenVector::Flatten(*in_tensor_d_ddout); - auto d_dx = framework::EigenVector::Flatten(*in_tensor_d_dx); - auto d_ddy = framework::EigenVector::Flatten(*out_tensor_d_ddy); - Eigen::DSizes size(in_tensor_x->numel()); - d_ddy.device(dev) = - (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); - } - } -#else - const auto* data_d_ddout = in_tensor_d_ddout->data(); - - if (out_tensor_d_x) { - auto* data_d_x = out_tensor_d_x->mutable_data(ctx.GetPlace()); - const auto* data_ddy = in_tensor_ddy->data(); - - const framework::DDim& dim = out_tensor_d_x->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_x[i] = data_ddy[i] * data_d_ddout[s]; - } - } - - if (out_tensor_d_y) { - auto* data_d_y = out_tensor_d_y->mutable_data(ctx.GetPlace()); - const auto* data_ddx = in_tensor_ddx->data(); - - const framework::DDim& dim = out_tensor_d_y->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_y[i] = data_ddx[i] * data_d_ddout[s]; - } - } - - if (out_tensor_d_dout) { - auto* data_d_dout = out_tensor_d_dout->mutable_data(ctx.GetPlace()); - auto* data_ddx = in_tensor_ddx->data(); - auto* data_ddy = in_tensor_ddy->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - - const framework::DDim& dim = in_tensor_ddx->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - bool new_s = false; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) { - ++s; - new_s = true; - } - if (new_s) { - data_d_dout[s] = - data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; - } else { - data_d_dout[s] += - data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; - } - new_s = false; - } - } - - if (out_tensor_d_ddx) { - auto* data_d_ddx = out_tensor_d_ddx->mutable_data(ctx.GetPlace()); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dy = in_tensor_d_dy->data(); - auto* data_y = in_tensor_y->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const framework::DDim& dim = out_tensor_d_ddx->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddx[i] = - data_dout[s] * data_d_dy[i] + data_y[i] * data_d_ddout[s]; - } - } - - if (out_tensor_d_ddy) { - auto* data_d_ddy = out_tensor_d_ddy->mutable_data(ctx.GetPlace()); - auto* data_dout = in_tensor_dout->data(); - auto* data_d_dx = in_tensor_d_dx->data(); - auto* data_x = in_tensor_x->data(); - auto* data_d_ddout = in_tensor_d_ddout->data(); - - const framework::DDim& dim = out_tensor_d_ddy->dims(); - size_t N = static_cast(framework::product(dim)); - auto step = dim[dim.size() - 1]; - int s = -1; - - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_d_ddy[i] = - data_dout[s] * data_d_dx[i] + data_x[i] * data_d_ddout[s]; - } - } -#endif - } -}; - template class MatMulV2GradKernel : public framework::OpKernel { public: - void MatMul(const framework::ExecutionContext& context, - const framework::Tensor& a, 
bool trans_a, - const framework::Tensor& b, bool trans_b, - framework::Tensor* out) const { - out->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); - auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); - if (a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast(1), out, - static_cast(0)); - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - bool is_fold_init_dims_a, const framework::Tensor& b, - bool trans_b, bool is_fold_init_dims_b, - framework::Tensor* out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto& ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, out); - } - } - void Compute(const framework::ExecutionContext& ctx) const override { bool transpose_x = ctx.Attr("trans_x"); bool transpose_y = ctx.Attr("trans_y"); - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - - framework::Tensor y_conj(y.type()); - framework::Tensor x_conj(y.type()); - - // get dims - std::vector x_dims = vectorize(x.dims()); - std::vector y_dims = vectorize(y.dims()); - std::vector dout_dims = vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - if (dx) dx->mutable_data(ctx.GetPlace()); - if (dy) dy->mutable_data(ctx.GetPlace()); - if (dout.numel() == 1) { - DotGradFunction()(&x, &y, &dout, dx, dy, ctx); - return; - } - } - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, - y_dims.cbegin()); - } - - // Case2: no broadcast or no batch size, it aims to speed and it is same as - // matmul in old version. - if (!is_broadcast) { - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + if (dx) dx->mutable_data(ctx.GetPlace()); + if (dy) dy->mutable_data(ctx.GetPlace()); - // for complex - ConjHelper conj_helper(ctx); - conj_helper(y, y_conj); - } + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_dout = paddle::experimental::MakePtenDenseTensor(*dout); + auto pt_dx = dx ? paddle::experimental::MakePtenDenseTensor(*dx) + : std::unique_ptr(nullptr); + auto pt_dy = dy ? 
paddle::experimental::MakePtenDenseTensor(*dy) + : std::unique_ptr(nullptr); - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - - // for complex - ConjHelper conj_helper(ctx); - conj_helper(x, x_conj); - } - if (transpose_x && transpose_y) { - CalcInputGrad(ctx, y_conj, true, true, dout, true, false, dx); - CalcInputGrad(ctx, dout, true, true, x_conj, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(ctx, y_conj, false, false, dout, true, false, dx); - CalcInputGrad(ctx, x_conj, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(ctx, dout, false, false, y_conj, false, true, dx); - CalcInputGrad(ctx, dout, true, true, x_conj, false, true, dy); - } else { - CalcInputGrad(ctx, dout, false, false, y_conj, true, false, dx); - CalcInputGrad(ctx, x_conj, true, true, dout, false, true, dy); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - Tensor dx_help, dy_help; - - ConjHelper conj_helper(ctx); - conj_helper(x, x_conj); - conj_helper(y, y_conj); - if (transpose_x) { - if (transpose_y) { - // X'Y': dA = Y'G', dB = G'X' - if (dx) - MatMulFunction(&y_conj, &dout, y_dims, dout_dims, - &dx_help, true, true, ctx); - if (dy) - MatMulFunction(&dout, &x_conj, dout_dims, x_dims, - &dy_help, true, true, ctx); - } else { - // X'Y: dX = YG', dY = XG - if (dx) - MatMulFunction(&y_conj, &dout, y_dims, dout_dims, - &dx_help, false, true, ctx); - if (dy) - MatMulFunction(&x_conj, &dout, x_dims, dout_dims, - &dy_help, false, false, ctx); - } - } else { - if (transpose_y) { - // XY': dX = GY, dY = G'X - if (dx) - MatMulFunction(&dout, &y_conj, dout_dims, y_dims, - &dx_help, false, false, ctx); - if (dy) - MatMulFunction(&dout, &x_conj, dout_dims, x_dims, - &dy_help, true, false, ctx); - } else { - // XY: dX = GY', dY = X'G - if (dx) - MatMulFunction(&dout, &y_conj, dout_dims, y_dims, - &dx_help, false, true, ctx); - if (dy) - MatMulFunction(&x_conj, &dout, x_dims, dout_dims, - &dy_help, true, false, ctx); - } - } - - // get help dims - const std::vector dx_help_dims = vectorize(dx_help.dims()); - const std::vector dy_help_dims = vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill(dx_broadcast_dims.data(), - dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill(dy_broadcast_dims.data(), - dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dx) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad(&dx_help, dx, dx_reduce_dims, - ctx); - } - 
dx->Resize(x.dims()); - } - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad(&dy_help, dy, dy_reduce_dims, - ctx); - } - dy->Resize(y.dims()); - } + auto& dev_ctx = ctx.device_context(); - // Get the OutputGrad(out) - } + // call new kernel + pten::MatmulGradKernel(dev_ctx, *pt_x, *pt_y, *pt_dout, transpose_x, + transpose_y, pt_dx.get(), pt_dy.get()); } }; template class MatMulV2DoubleGradKernel : public framework::OpKernel { public: - void MatMul(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, framework::Tensor* out, - bool flag) const { - out->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); - auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); - if (a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast(1), out, - static_cast(flag)); - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - bool is_fold_init_dims_a, const framework::Tensor& b, - bool trans_b, bool is_fold_init_dims_b, - framework::Tensor* out, bool flag) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out, flag); - } else { - auto& ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? 
FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, out, flag); - } - } - void Compute(const framework::ExecutionContext& context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = *context.Input("DOut"); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* dout = context.Input("DOut"); auto* ddx = context.Input("DDX"); auto* ddy = context.Input("DDY"); @@ -1486,316 +185,38 @@ class MatMulV2DoubleGradKernel : public framework::OpKernel { bool transpose_x = context.Attr("trans_x"); bool transpose_y = context.Attr("trans_y"); - // Get dims from the input x, y, output_grad - std::vector x_dims = vectorize(x.dims()); - std::vector y_dims = vectorize(y.dims()); - std::vector dout_dims = vectorize(dout.dims()); - framework::Tensor x_conj(x.type()); - framework::Tensor y_conj(y.type()); - framework::Tensor dout_conj(dout.type()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - DotDoubleGradFunction()(&x, &y, dx, dy, &dout, ddx, ddy, - ddout, context); - return; - } - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, - y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - framework::DDim dx_dims; - - ConjHelper conj_helper(context); - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - framework::DDim ddout_dims; - if (ddout) { - ddout_dims = ddout->dims(); - if (ddout_dims != dout.dims()) { - ddout->Resize(dout.dims()); - } - } - - if (ddx || ddy) { - ConjHelper conj_helper(context); - conj_helper(dout, dout_conj); - } - if (ddout) { - ConjHelper conj_helper(context); - conj_helper(x, x_conj); - conj_helper(y, y_conj); - } - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = *ddx; - if (ddx_mat.dims() != x.dims()) { - ddx_mat.Resize(x.dims()); - } - if (dy) { - if (transpose_x && transpose_y) { - // dy = dout' * ddx' - CalcInputGrad(context, dout_conj, true, true, ddx_mat, true, false, - dy, false); - } else if (transpose_x) { - // dy = ddx * dout - CalcInputGrad(context, ddx_mat, false, false, dout_conj, false, - true, dy, false); - } else if (transpose_y) { - // dy = dout' * ddx - CalcInputGrad(context, dout_conj, true, true, ddx_mat, false, true, - dy, false); - } else { - // dy = ddx' * dout - CalcInputGrad(context, ddx_mat, true, true, dout_conj, false, true, - dy, false); - } - } - - if (ddout) { - CalcInputGrad(context, ddx_mat, transpose_x, true, y_conj, - transpose_y, false, ddout, ddout_flag); - ddout_flag = true; - } - } - - if (ddy) { - auto ddy_mat = *ddy; - if (ddy_mat.dims() != y.dims()) { - ddy_mat.Resize(y.dims()); - } - if (dx) { - if (transpose_x && transpose_y) { - // dx = ddy' * dout' - CalcInputGrad(context, ddy_mat, true, true, dout_conj, true, false, - dx, false); - } else if (transpose_x) { - // dx = ddy * dout' - CalcInputGrad(context, ddy_mat, false, false, dout_conj, true, - false, dx, false); - } else if (transpose_y) { - // dx = dout * ddy - CalcInputGrad(context, dout_conj, false, false, ddy_mat, false, - true, dx, 
false); - } else { - // dx = dout * ddy' - CalcInputGrad(context, dout_conj, false, false, ddy_mat, true, - false, dx, false); - } - } + if (dx) dx->mutable_data(context.GetPlace()); + if (dy) dy->mutable_data(context.GetPlace()); + if (ddout) ddout->mutable_data(context.GetPlace()); - if (ddout) { - CalcInputGrad(context, x_conj, transpose_x, true, ddy_mat, - transpose_y, false, ddout, ddout_flag); - } - } + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_dout = paddle::experimental::MakePtenDenseTensor(*dout); + auto pt_ddx = paddle::experimental::MakePtenDenseTensor(*ddx); + auto pt_ddy = paddle::experimental::MakePtenDenseTensor(*ddy); + auto pt_dx = paddle::experimental::MakePtenDenseTensor(*dx); + auto pt_dy = paddle::experimental::MakePtenDenseTensor(*dy); + auto pt_ddout = paddle::experimental::MakePtenDenseTensor(*ddout); - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } + auto& dev_ctx = context.device_context(); - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - - if (ddout) { - if (ddout_dims != dout.dims()) { - ddout->Resize(ddout_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - framework::Tensor ddy_conj(ddx->type()); - framework::Tensor ddx_conj(ddy->type()); - - Tensor dx_help, dy_help; - if (dx || dy) { - ConjHelper conj_helper(context); - conj_helper(dout, dout_conj); - } - if (ddout) { - ConjHelper conj_helper(context); - conj_helper(x, x_conj); - conj_helper(y, y_conj); - } - if (transpose_x) { - if (transpose_y) { - if (dx) - MatMulFunction(ddy, &dout_conj, y_dims, dout_dims, - &dx_help, true, true, context); - if (dy) - MatMulFunction(&dout_conj, ddx, dout_dims, x_dims, - &dy_help, true, true, context); - } else { - if (dx) - MatMulFunction(ddy, &dout_conj, y_dims, dout_dims, - &dx_help, false, true, context); - if (dy) - MatMulFunction(ddx, &dout_conj, x_dims, dout_dims, - &dy_help, false, false, context); - } - } else { - if (transpose_y) { - if (dx) - MatMulFunction(&dout_conj, ddy, dout_dims, y_dims, - &dx_help, false, false, context); - if (dy) - MatMulFunction(&dout_conj, ddx, dout_dims, x_dims, - &dy_help, true, false, context); - } else { - if (dx) - MatMulFunction(&dout_conj, ddy, dout_dims, y_dims, - &dx_help, false, true, context); - if (dy) - MatMulFunction(ddx, &dout_conj, x_dims, dout_dims, - &dy_help, true, false, context); - } - } - - // get help dims - const std::vector dx_help_dims = vectorize(dx_help.dims()); - const std::vector dy_help_dims = vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill(dx_broadcast_dims.data(), - dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill(dy_broadcast_dims.data(), - dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && 
dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // Reduce sum to get grad by ReduceSum - if (dx) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad(&dx_help, dx, dx_reduce_dims, - context); - } - dx->Resize(x.dims()); - } - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad(&dy_help, dy, dy_reduce_dims, - context); - } - dy->Resize(y.dims()); - } - - if (ddout) { - // Calculate the gradient of OutputGrad(Out) - MatMulFunction(ddx, &y_conj, x_dims, y_dims, ddout, - transpose_x, transpose_y, context); - MatMulFunction(&x_conj, ddy, x_dims, y_dims, ddout, - transpose_x, transpose_y, context, - true); - } - } + // call new kernel + pten::MatmulDoubleGradKernel(dev_ctx, *pt_x, *pt_y, *pt_dout, *pt_ddx, + *pt_ddy, transpose_x, transpose_y, + pt_dx.get(), pt_dy.get(), pt_ddout.get()); } }; template class MatMulV2TripleGradKernel : public framework::OpKernel { public: - void MatMul(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, framework::Tensor* out, - bool flag) const { - out->mutable_data(context.GetPlace()); - auto blas = math::GetBlas(context); - auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); - if (a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast(1), out, - static_cast(flag)); - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - bool is_fold_init_dims_a, const framework::Tensor& b, - bool trans_b, bool is_fold_init_dims_b, - framework::Tensor* out, bool flag) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out, flag); - } else { - auto& ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? 
FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, out, flag); - } - } - void Compute(const framework::ExecutionContext& context) const override { // get input - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = *context.Input("DOut"); - auto ddx = *context.Input("DDX"); - auto ddy = *context.Input("DDY"); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* dout = context.Input("DOut"); + auto* ddx = context.Input("DDX"); + auto* ddy = context.Input("DDY"); auto* d_dx = context.Input("D_DX"); auto* d_dy = context.Input("D_DY"); @@ -1812,539 +233,34 @@ class MatMulV2TripleGradKernel : public framework::OpKernel { bool transpose_x = context.Attr("trans_x"); bool transpose_y = context.Attr("trans_y"); - // Get dims from the input x, y, output_grad - std::vector x_dims = vectorize(x.dims()); - std::vector y_dims = vectorize(y.dims()); - std::vector dout_dims = vectorize(dout.dims()); - framework::Tensor x_conj(x.type()); - framework::Tensor y_conj(y.type()); - framework::Tensor dout_conj(dout.type()); - framework::Tensor ddx_conj(ddx.type()); - framework::Tensor ddy_conj(ddy.type()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's and y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 1"; - - DotTripleGradFunction()( - &x, &y, &ddx, &ddy, d_dx, d_dy, &dout, d_ddout, out_d_x, out_d_y, - out_d_dout, out_d_ddx, out_d_ddy, context); - return; - } - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, - y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 2"; - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - if (ddx.dims() != x.dims()) { - ddx.Resize(x.dims()); - } - - if (ddy.dims() != y.dims()) { - ddy.Resize(y.dims()); - } - - ConjHelper conj_helper(context); - - framework::DDim out_dx_dims; - if (out_d_x) { - out_dx_dims = out_d_x->dims(); - if (out_dx_dims != x.dims()) { - out_d_x->Resize(x.dims()); - } - } - - framework::DDim out_dy_dims; - if (out_d_y) { - out_dy_dims = out_d_y->dims(); - if (out_dy_dims != y.dims()) { - out_d_y->Resize(y.dims()); - } - } - - framework::DDim out_d_dout_dims; - if (out_d_dout) { - out_d_dout_dims = out_d_dout->dims(); - if (out_d_dout_dims != dout.dims()) { - out_d_dout->Resize(dout.dims()); - } - } - - framework::DDim out_d_ddx_dims; - if (out_d_ddx) { - out_d_ddx_dims = out_d_ddx->dims(); - if (out_d_ddx_dims != x.dims()) { - out_d_ddx->Resize(x.dims()); - } - } - - framework::DDim out_d_ddy_dims; - if (out_d_ddy) { - out_d_ddy_dims = out_d_ddy->dims(); - if (out_d_ddy_dims != y.dims()) { - out_d_ddy->Resize(y.dims()); - } - } - - if (out_d_dout) { - ConjHelper conj_helper(context); - conj_helper(ddx, ddx_conj); - conj_helper(ddy, ddy_conj); - } - - if (out_d_ddx || out_d_ddy) { - ConjHelper conj_helper(context); - conj_helper(x, x_conj); - conj_helper(y, y_conj); - conj_helper(dout, dout_conj); - } - - bool d_dout_flag = false; - bool d_ddx_flag = false; - bool d_ddy_flag = false; - - if (d_ddout) { - auto d_ddout_mat = *d_ddout; - if (d_ddout_mat.dims() != dout.dims()) { - d_ddout_mat.Resize(dout.dims()); - } - - if (out_d_y) { - if (transpose_x && transpose_y) { - // 
out_d_y = d_ddout' * ddx' - CalcInputGrad(context, d_ddout_mat, true, true, ddx_conj, true, - false, out_d_y, false); - } else if (transpose_x) { - // out_d_y = ddx * d_ddout - CalcInputGrad(context, ddx_conj, false, false, d_ddout_mat, false, - true, out_d_y, false); - } else if (transpose_y) { - // out_d_y = d_ddout' * ddx - CalcInputGrad(context, d_ddout_mat, true, true, ddx_conj, false, - true, out_d_y, false); - } else { - // out_d_y = ddx' * d_ddout - CalcInputGrad(context, ddx_conj, true, true, d_ddout_mat, false, - true, out_d_y, false); - } - } - - if (out_d_x) { - if (transpose_x && transpose_y) { - // out_d_x = ddy' * d_ddout' - CalcInputGrad(context, ddy_conj, true, true, d_ddout_mat, true, - false, out_d_x, false); - } else if (transpose_x) { - // out_d_x = ddy * d_ddout' - CalcInputGrad(context, ddy_conj, false, false, d_ddout_mat, true, - false, out_d_x, false); - } else if (transpose_y) { - // out_d_x = d_ddout * ddy - CalcInputGrad(context, d_ddout_mat, false, false, ddy_conj, false, - true, out_d_x, false); - } else { - // out_d_x = d_ddout * ddy' - CalcInputGrad(context, d_ddout_mat, false, false, ddy_conj, true, - false, out_d_x, false); - } - } - - // equations: - // d_ddx = DOut * D_DY + Y * D_DDOut - // Let: d_ddx1 = Y * D_DDOut - // Let: d_ddx2 = DOut * D_DY - - // d_ddy = DOut * D_DX + X * D_DDOut - // Let: d_ddy1 = X * D_DDOut - // Let: d_ddy2 = DOut * D_DX - - // d_dout = DDY * D_DX + DDX * D_DY - // Let: d_dout1 = DDX * D_DY - // Let: d_dout2 = DDY * D_DX - - // compute d_ddx1 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - CalcInputGrad(context, y_conj, true, true, d_ddout_mat, true, false, - out_d_ddx, d_ddx_flag); - } else if (transpose_x) { - // out_d_ddx1 = y * d_ddout' - CalcInputGrad(context, y_conj, false, false, d_ddout_mat, true, - false, out_d_ddx, d_ddx_flag); - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - CalcInputGrad(context, d_ddout_mat, false, false, y_conj, false, - true, out_d_ddx, d_ddx_flag); - } else { - // out_d_ddx1 = d_ddout * y' - CalcInputGrad(context, d_ddout_mat, false, false, y_conj, true, - false, out_d_ddx, d_ddx_flag); - } - d_ddx_flag = true; - } - - // compute d_ddy1 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - CalcInputGrad(context, d_ddout_mat, true, true, x_conj, true, false, - out_d_ddy, false); - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - CalcInputGrad(context, x_conj, false, false, d_ddout_mat, false, - true, out_d_ddy, false); - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - CalcInputGrad(context, d_ddout_mat, true, true, x_conj, false, true, - out_d_ddy, false); - } else { - // out_d_ddy1 = x' * d_ddout - CalcInputGrad(context, x_conj, true, true, d_ddout_mat, false, true, - out_d_ddy, false); - } - d_ddy_flag = true; - } - } - - if (d_dy) { - auto d_dy_mat = *d_dy; - if (d_dy_mat.dims() != y.dims()) { - d_dy_mat.Resize(y.dims()); - } - - // compute d_dout1 - if (out_d_dout) { - CalcInputGrad(context, ddx_conj, transpose_x, true, d_dy_mat, - transpose_y, false, out_d_dout, d_dout_flag); - d_dout_flag = true; - } - - // compute d_ddx2 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx2 = D_DY' * DOut' - CalcInputGrad(context, d_dy_mat, true, true, dout_conj, true, false, - out_d_ddx, d_ddx_flag); - } else if (transpose_x) { - // out_d_ddx2 = D_DY * Dout' - CalcInputGrad(context, d_dy_mat, false, false, dout_conj, true, - false, out_d_ddx, d_ddx_flag); - } else if (transpose_y) { 
- // out_d_ddx2 = Dout * D_DY - CalcInputGrad(context, dout_conj, false, false, d_dy_mat, false, - true, out_d_ddx, d_ddx_flag); - } else { - // out_d_ddx2 = Dout * D_DY' - CalcInputGrad(context, dout_conj, false, false, d_dy_mat, true, - false, out_d_ddx, d_ddx_flag); - } - } - } - - if (d_dx) { - auto d_dx_mat = *d_dx; - if (d_dx_mat.dims() != x.dims()) { - d_dx_mat.Resize(x.dims()); - } - - // compute d_dout2 - if (out_d_dout) { - CalcInputGrad(context, d_dx_mat, transpose_x, true, ddy_conj, - transpose_y, false, out_d_dout, d_dout_flag); - } - - // compute d_ddy2 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy2 = dout' * d_dx' - CalcInputGrad(context, dout_conj, true, true, d_dx_mat, true, false, - out_d_ddy, d_ddy_flag); - } else if (transpose_x) { - // out_d_ddy2 = d_dx * dout - CalcInputGrad(context, d_dx_mat, false, false, dout_conj, false, - true, out_d_ddy, d_ddy_flag); - } else if (transpose_y) { - // out_d_ddy2 = dout' * d_dx - CalcInputGrad(context, dout_conj, true, true, d_dx_mat, false, true, - out_d_ddy, d_ddy_flag); - } else { - // out_d_ddy2 = d_dx' * dout - CalcInputGrad(context, d_dx_mat, true, true, dout_conj, false, true, - out_d_ddy, d_ddy_flag); - } - } - } - - if (out_d_x) { - if (out_dx_dims != x.dims()) { - out_d_x->Resize(out_dx_dims); - } - } - - if (out_d_y) { - if (out_dy_dims != y.dims()) { - out_d_y->Resize(out_dy_dims); - } - } - - if (out_d_dout) { - if (out_d_dout_dims != dout.dims()) { - out_d_dout->Resize(out_d_dout_dims); - } - } - - if (out_d_ddx) { - if (out_d_ddx_dims != x.dims()) { - out_d_ddx->Resize(out_d_ddx_dims); - } - } - - if (out_d_ddy) { - if (out_d_ddy_dims != x.dims()) { - out_d_ddy->Resize(out_d_ddy_dims); - } - } - - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 3"; - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. 
So we should avoid the case in reality"; - - Tensor out_dx_help, out_dy_help; - Tensor out_d_ddx_help, out_d_ddy_help; - if (out_d_dout) { - ConjHelper conj_helper(context); - conj_helper(ddx, ddx_conj); - conj_helper(ddy, ddy_conj); - } - if (out_d_ddx || out_d_ddy) { - ConjHelper conj_helper(context); - conj_helper(x, x_conj); - conj_helper(y, y_conj); - conj_helper(dout, dout_conj); - } - - if (transpose_x) { - if (transpose_y) { - // dX = ddY' d_ddout’, dY = d_ddout’ ddX' - if (out_d_x) - MatMulFunction(&ddy_conj, d_ddout, y_dims, - dout_dims, &out_dx_help, true, - true, context); - if (out_d_y) - MatMulFunction(d_ddout, &ddx_conj, dout_dims, - x_dims, &out_dy_help, true, true, - context); - } else { - // dX = ddY d_ddout', dY = ddX d_ddout - if (out_d_x) - MatMulFunction(&ddy_conj, d_ddout, y_dims, - dout_dims, &out_dx_help, false, - true, context); - if (out_d_y) - MatMulFunction(&ddx_conj, d_ddout, x_dims, - dout_dims, &out_dy_help, false, - false, context); - } - } else { - if (transpose_y) { - // dX = d_ddout ddY, dY = d_ddout’ ddX - if (out_d_x) - MatMulFunction(d_ddout, &ddy_conj, dout_dims, - y_dims, &out_dx_help, false, false, - context); - if (out_d_y) - MatMulFunction(d_ddout, &ddx_conj, dout_dims, - x_dims, &out_dy_help, true, false, - context); - } else { - // dX = d_ddout ddY', dY = ddX' d_ddout - if (out_d_x) - MatMulFunction(d_ddout, &ddy_conj, dout_dims, - y_dims, &out_dx_help, false, true, - context); - if (out_d_y) - MatMulFunction(&ddx_conj, d_ddout, x_dims, - dout_dims, &out_dy_help, true, - false, context); - } - } - - // get help dims - const std::vector dx_help_dims = - vectorize(out_dx_help.dims()); - const std::vector dy_help_dims = - vectorize(out_dx_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill(dx_broadcast_dims.data(), - dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill(dy_broadcast_dims.data(), - dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // Reduce sum to get grad by ReduceSum - if (out_d_x) { - if (dx_reduce_dims.empty()) { - *out_d_x = std::move(out_dx_help); - } else { - ReduceSumForMatmulGrad(&out_dx_help, out_d_x, - dx_reduce_dims, context); - } - out_d_x->Resize(x.dims()); - } - - if (out_d_y) { - if (dy_reduce_dims.empty()) { - *out_d_y = std::move(out_dy_help); - } else { - ReduceSumForMatmulGrad(&out_dy_help, out_d_y, - dy_reduce_dims, context); - } - out_d_y->Resize(y.dims()); - } - - // compute d_dout - if (out_d_dout) { - MatMulFunction(d_dx, &ddy_conj, x_dims, y_dims, - out_d_dout, transpose_x, transpose_y, - context); - MatMulFunction(&ddx_conj, d_dy, x_dims, y_dims, - out_d_dout, transpose_x, transpose_y, - context, true); - } - - // compute d_ddx - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - MatMulFunction(&y_conj, d_ddout, y_dims, dout_dims, - &out_d_ddx_help, true, true, - context); - // out_d_ddx2 = D_DY' * DOut' - MatMulFunction(d_dy, &dout_conj, y_dims, dout_dims, - &out_d_ddx_help, true, true, context, - true); - } else if 
(transpose_x) { - // out_d_ddx1 = y * d_ddout' - MatMulFunction(&y_conj, d_ddout, y_dims, dout_dims, - &out_d_ddx_help, false, true, - context); - // out_d_ddx2 = D_DY * Dout' - MatMulFunction(d_dy, &dout_conj, y_dims, dout_dims, - &out_d_ddx_help, false, true, - context, true); - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - MatMulFunction(d_ddout, &y_conj, dout_dims, y_dims, - &out_d_ddx_help, false, false, - context); - // out_d_ddx2 = Dout * D_DY - MatMulFunction(&dout_conj, d_dy, dout_dims, y_dims, - &out_d_ddx_help, false, false, - context, true); - } else { - // out_d_ddx1 = d_ddout * y' - MatMulFunction(d_ddout, &y_conj, dout_dims, y_dims, - &out_d_ddx_help, false, true, - context); - // out_d_ddx2 = Dout * D_DY' - MatMulFunction(&dout_conj, d_dy, dout_dims, y_dims, - &out_d_ddx_help, false, true, - context, true); - } - if (dx_reduce_dims.empty()) { - *out_d_ddx = std::move(out_d_ddx_help); - } else { - ReduceSumForMatmulGrad(&out_d_ddx_help, out_d_ddx, - dx_reduce_dims, context); - } - out_d_ddx->Resize(x.dims()); - } - - // compute d_ddy - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - MatMulFunction(d_ddout, &x_conj, dout_dims, x_dims, - &out_d_ddy_help, true, true, - context); - // out_d_ddy2 = dout' * d_dx' - MatMulFunction(&dout_conj, d_dx, dout_dims, x_dims, - &out_d_ddy_help, true, true, context, - true); - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - MatMulFunction(&x_conj, d_ddout, x_dims, dout_dims, - &out_d_ddy_help, false, false, - context); - // out_d_ddy2 = d_dx * dout - MatMulFunction(d_dx, &dout_conj, x_dims, dout_dims, - &out_d_ddy_help, false, false, - context, true); - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - MatMulFunction(d_ddout, &x_conj, dout_dims, x_dims, - &out_d_ddy_help, true, false, - context); - // out_d_ddy2 = dout' * d_dx - MatMulFunction(&dout_conj, d_dx, dout_dims, x_dims, - &out_d_ddy_help, true, false, - context, true); - } else { - // out_d_ddy1 = x' * d_ddout - MatMulFunction(&x_conj, d_ddout, x_dims, dout_dims, - &out_d_ddy_help, true, false, - context); - // out_d_ddy2 = d_dx' * dout - MatMulFunction(d_dx, &dout_conj, x_dims, dout_dims, - &out_d_ddy_help, true, false, - context, true); - } - - if (dy_reduce_dims.empty()) { - *out_d_ddy = std::move(out_d_ddy_help); - } else { - ReduceSumForMatmulGrad(&out_d_ddy_help, out_d_ddy, - dy_reduce_dims, context); - } - out_d_ddy->Resize(y.dims()); - } - } + if (out_d_x) out_d_x->mutable_data(context.GetPlace()); + if (out_d_y) out_d_y->mutable_data(context.GetPlace()); + if (out_d_dout) out_d_dout->mutable_data(context.GetPlace()); + if (out_d_ddx) out_d_ddx->mutable_data(context.GetPlace()); + if (out_d_ddy) out_d_ddy->mutable_data(context.GetPlace()); + + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_dout = paddle::experimental::MakePtenDenseTensor(*dout); + auto pt_ddx = paddle::experimental::MakePtenDenseTensor(*ddx); + auto pt_ddy = paddle::experimental::MakePtenDenseTensor(*ddy); + auto pt_d_dx = paddle::experimental::MakePtenDenseTensor(*d_dx); + auto pt_d_dy = paddle::experimental::MakePtenDenseTensor(*d_dy); + auto pt_d_ddout = paddle::experimental::MakePtenDenseTensor(*d_ddout); + + auto pt_out_d_x = paddle::experimental::MakePtenDenseTensor(*out_d_x); + auto pt_out_d_y = paddle::experimental::MakePtenDenseTensor(*out_d_y); + auto pt_out_d_dout = paddle::experimental::MakePtenDenseTensor(*out_d_dout); + auto pt_out_d_ddx = 
        paddle::experimental::MakePtenDenseTensor(*out_d_ddx);
+    auto pt_out_d_ddy = paddle::experimental::MakePtenDenseTensor(*out_d_ddy);
+
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    // call new kernel
+    pten::MatmulTripleGradKernel<T>(dev_ctx, *pt_x, *pt_y, *pt_dout, *pt_ddx,
+                                    *pt_ddy, *pt_d_dx, *pt_d_dy, *pt_d_ddout,
+                                    transpose_x, transpose_y, pt_out_d_x.get(),
+                                    pt_out_d_y.get(), pt_out_d_dout.get(),
+                                    pt_out_d_ddx.get(), pt_out_d_ddy.get());
   }
 };
 
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index 1b4254ad2c103..0b5f5cb18e13d 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -70,6 +70,12 @@ DenseTensor& DenseTensor::operator=(const DenseTensor& other) {
   return *this;
 }
 
+DenseTensor& DenseTensor::operator=(DenseTensor&& other) {
+  meta_ = std::move(other.meta_);
+  storage_.swap(other.storage_);
+  return *this;
+}
+
 int64_t DenseTensor::numel() const {
   if (meta_.is_scalar) {
     return 1;
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index fc92e84f52cea..1502accd197be 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -97,6 +97,8 @@ class DenseTensor : public TensorBase,
   /// \brief DenseTensor shallow copy assignment.
   DenseTensor& operator=(const DenseTensor& other);
 
+  DenseTensor& operator=(DenseTensor&& other);
+
   /// \brief Destroy the tensor object and release exclusive resources.
   virtual ~DenseTensor() = default;
 
diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h
index 56f7eea7ea802..46fa6dd376ee3 100644
--- a/paddle/pten/core/kernel_alias_name.h
+++ b/paddle/pten/core/kernel_alias_name.h
@@ -29,6 +29,9 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"flatten_contiguous_range", "flatten"},
    {"flatten_contiguous_range_grad", "flatten_grad"},
     {"matmul_v2", "matmul"},
+    {"matmul_v2_grad", "matmul_grad"},
+    {"matmul_v2_grad_grad", "matmul_double_grad"},
+    {"matmul_v2_triple_grad", "matmul_triple_grad"},
     {"reduce_mean", "mean"},
     {"reduce_sum", "sum"},
     {"reshape2", "reshape"},
@@ -36,6 +39,8 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"flatten", "deprecated"},
     {"flatten_grad", "deprecated"},
     {"matmul", "deprecated"},
+    {"matmul_grad", "deprecated"},
+    {"matmul_grad_grad", "deprecated"},
     {"mean", "deprecated"},
     {"reshape", "deprecated"},
     {"sum", "deprecated"}};
diff --git a/paddle/pten/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc
index b2c84807951a5..74bd6d17f066a 100644
--- a/paddle/pten/core/kernel_context.cc
+++ b/paddle/pten/core/kernel_context.cc
@@ -50,6 +50,11 @@ void KernelContext::EmplaceBackOutputWithoutSetRange(
   outputs_.emplace_back(std::move(output));
 }
 
+void KernelContext::SetOutputWithoutSetRange(
+    int index, std::shared_ptr<TensorBase> output) {
+  outputs_.at(index) = std::move(output);
+}
+
 void KernelContext::EmplaceBackOutputs(
     paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
   int index = outputs_.size();
@@ -119,8 +124,10 @@ void KernelContext::ClearData() {
     }
   }
   for (auto& out : outputs_) {
-    CompatibleDenseTensorUtils::ClearStorage(
-        static_cast<DenseTensor*>(out.get()));
+    if (out) {
+      CompatibleDenseTensorUtils::ClearStorage(
+          static_cast<DenseTensor*>(out.get()));
+    }
   }
   attrs_.clear();
 }
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 6c695987096cb..b6cc15c084ac0 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -62,6 +62,8 @@ class KernelContext {
 
   void EmplaceBackOutputWithoutSetRange(std::shared_ptr<TensorBase> output);
 
+  void SetOutputWithoutSetRange(int index, std::shared_ptr<TensorBase> output);
+
   void EmplaceBackOutputs(
       paddle::SmallVector<std::shared_ptr<TensorBase>> outputs);
 
@@ -80,6 +82,14 @@ class KernelContext {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
   }
 
+  template <typename TensorType>
+  paddle::optional<const TensorType&> OptionalInputAt(size_t idx) const {
+    const auto& input = inputs_.at(idx);
+    return input ? paddle::optional<const TensorType&>{static_cast<
+                       const TensorType&>(*input)}
+                 : paddle::optional<const TensorType&>{paddle::none};
+  }
+
   std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
     return inputs_.at(idx);
   }
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index bd4687c6e7f4e..f08ef4acfd9ce 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -65,6 +65,10 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
       } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
         args_def->AppendInput(
             default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type == std::type_index(typeid(
+                     paddle::optional<const DenseTensor&>))) {
+        args_def->AppendInput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
       } else if (arg_type ==
                  std::type_index(typeid(const std::vector<DenseTensor>&))) {
         args_def->AppendInput(
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 5087d912ed525..60201151c62a2 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -77,6 +77,27 @@ namespace pten {
     }                                                                      \
   }
 
+#define PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type)    \
+  template <typename... Tail>                                             \
+  struct KernelCallHelper<paddle::optional<const tensor_type&>, Tail...> {\
+    template <int dev_ctx_idx,                                            \
+              int in_idx,                                                 \
+              int attr_idx,                                               \
+              int out_idx,                                                \
+              typename... PreviousArgs>                                   \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {     \
+      static_assert(attr_idx == 0,                                        \
+                    "Kernel's Input should appear before Attributes.");   \
+      static_assert(out_idx == 0,                                         \
+                    "Kernel's Input should appear before Outputs.");      \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);        \
+      auto arg = ctx->OptionalInputAt<tensor_type>(range.first);          \
+      KernelCallHelper<Tail...>::                                         \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(   \
+              ctx, pargs..., arg);                                        \
+    }                                                                     \
+  }
+
 #define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)       \
   template <typename... Tail>                                             \
   struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {     \
@@ -190,6 +211,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
   /* Input Helpers */
 
   PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor);
   PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
   // TODO(chenweihang): adapt SelectedRows
   // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h
index 22f287468e673..71bc518aa89f8 100644
--- a/paddle/pten/include/linalg.h
+++ b/paddle/pten/include/linalg.h
@@ -30,7 +30,7 @@ DenseTensor Dot(const ContextT& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           dev_ctx.GetPlace()),
       std::move(out_meta));
-  Dot<T>(dev_ctx, x, y, &dense_out);
+  DotKernel<T>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index faa4c8db8dac3..5070d0d4e0e5a 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -48,15 +48,4 @@ DenseTensor Scale(const ContextT& dev_ctx,
   return dense_out;
 }
 
-template <typename T, typename ContextT>
-DenseTensor Conj(const ContextT& dev_ctx, const DenseTensor& x) {
-  auto out_meta = UnchangedInferMeta(x.meta());
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  Conj<T>(dev_ctx, x, &dense_out);
-  return dense_out;
-}
-
 }  // namespace pten
diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h
index dfe8fff43e6ef..e9f717152a458 100644
--- a/paddle/pten/kernels/complex_kernel.h
+++ b/paddle/pten/kernels/complex_kernel.h
@@ -16,9 +16,20 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/infermeta/unary.h"
+#include "paddle/pten/kernels/empty_kernel.h"
+
 
 namespace pten {
 
 template <typename T, typename Context>
-void Conj(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T, typename Context>
+DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
+  auto out_meta = UnchangedInferMeta(x.meta());
+  auto dense_out = Empty<T, Context>(dev_ctx, std::move(out_meta));
+  ConjKernel<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc
index 9bf27ef22dcd7..10e7e684db3c1 100644
--- a/paddle/pten/kernels/cpu/complex_kernel.cc
+++ b/paddle/pten/kernels/cpu/complex_kernel.cc
@@ -24,7 +24,7 @@
 PT_REGISTER_CTX_KERNEL(conj,
                        CPU,
                        ALL_LAYOUT,
-                       pten::Conj,
+                       pten::ConjKernel,
                        paddle::platform::complex<float>,
                        paddle::platform::complex<double>,
                        float,
diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc
new file mode 100644
index 0000000000000..c9d5c35e134c8
--- /dev/null
+++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/dot_grad_kernel.h"
+#include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+#include "paddle/fluid/platform/complex.h"
+
+PT_REGISTER_CTX_KERNEL(dot_grad,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::DotGradKernel,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       paddle::platform::complex<float>,
+                       paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc
index 247ad1216a266..72e9e28907f90 100644
--- a/paddle/pten/kernels/cpu/dot_kernel.cc
+++ b/paddle/pten/kernels/cpu/dot_kernel.cc
@@ -23,10 +23,10 @@
 namespace pten {
 
 template <typename T, typename Context>
-void Dot(const Context& dev_ctx,
-         const DenseTensor& x,
-         const DenseTensor& y,
-         DenseTensor* out) {
+void DotKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               DenseTensor* out) {
   auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
   auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
   auto* z = out->mutable_data<T>();
@@ -52,7 +52,7 @@ using complex128 = ::paddle::platform::complex<double>;
 PT_REGISTER_CTX_KERNEL(dot,
                        CPU,
                        ALL_LAYOUT,
-                       pten::Dot,
+                       pten::DotKernel,
                        float,
                        double,
                        int,
diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc
new file mode 100644
index 0000000000000..5a8abb6701b0e
--- /dev/null
+++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/matmul_grad_kernel.h" + +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" + +PT_REGISTER_CTX_KERNEL(matmul_grad, + CPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_CTX_KERNEL(matmul_double_grad, + CPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_CTX_KERNEL(matmul_triple_grad, + CPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/dot_grad_kernel.h b/paddle/pten/kernels/dot_grad_kernel.h new file mode 100644 index 0000000000000..b0940e5b16a33 --- /dev/null +++ b/paddle/pten/kernels/dot_grad_kernel.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void DotGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy); + +template +void DotDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void DotTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + const DenseTensor& dout, + const DenseTensor& d_ddout, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_ddx, + DenseTensor* d_ddy, + DenseTensor* d_dout); + +} // namespace pten diff --git a/paddle/pten/kernels/dot_kernel.h b/paddle/pten/kernels/dot_kernel.h index 9924749cd2141..5ef660265333e 100644 --- a/paddle/pten/kernels/dot_kernel.h +++ b/paddle/pten/kernels/dot_kernel.h @@ -19,9 +19,9 @@ namespace pten { template -void Dot(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); +void DotKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); } // namespace pten diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 94886806bccf3..2dd55a13e38e5 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -1,33 +1,34 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/fluid/platform/complex.h" + namespace pten { -template -void EmptyKernel(const ContextT& dev_ctx, +template +void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DenseTensor* out) { out->Resize(paddle::framework::make_ddim(shape.GetData())); } -template -void EmptyLikeKernel(const ContextT& dev_ctx, DenseTensor* out) { +template +void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { out->mutable_data(); } @@ -37,44 +38,62 @@ PT_REGISTER_CTX_KERNEL(empty, CPU, ALL_LAYOUT, pten::EmptyKernel, - bool, - int, - int64_t, float, double, - paddle::platform::float16) {} + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} PT_REGISTER_CTX_KERNEL(empty_like, CPU, ALL_LAYOUT, pten::EmptyLikeKernel, - bool, - int, - int64_t, float, double, - paddle::platform::float16) {} + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PT_REGISTER_CTX_KERNEL(empty, GPU, ALL_LAYOUT, pten::EmptyKernel, - bool, - int, - int64_t, float, double, - paddle::platform::float16) {} + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} PT_REGISTER_CTX_KERNEL(empty_like, GPU, ALL_LAYOUT, pten::EmptyLikeKernel, - bool, - int, - int64_t, float, double, - paddle::platform::float16) {} + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/empty_kernel.h b/paddle/pten/kernels/empty_kernel.h index d71ee0b1266f2..d283ef5c1e41e 100644 --- a/paddle/pten/kernels/empty_kernel.h +++ b/paddle/pten/kernels/empty_kernel.h @@ -41,6 +41,14 @@ DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) { return dense_out; } +template +DenseTensor Empty(const Context& dev_ctx) { + return Empty(dev_ctx, + {paddle::experimental::CppTypeToDataType::Type(), + {-1}, + DataLayout::NCHW}); +} + template DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape, diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 5a3c14de4036a..02f050f5bc838 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -24,7 +24,8 @@ PT_REGISTER_CTX_KERNEL(conj, GPU, ALL_LAYOUT, - pten::Conj, + pten::ConjKernel, + paddle::platform::float16, paddle::platform::complex, paddle::platform::complex, float, diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu new file mode 100644 index 0000000000000..42af96f7c7265 --- /dev/null +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/dot_grad_kernel.h" +#include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/platform/complex.h" + +PT_REGISTER_CTX_KERNEL(dot_grad, + GPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 6b66d45b7dd48..1f9e7aa3f1cfd 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -25,10 +25,10 @@ namespace pten { template -void Dot(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { +void DotKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { out->mutable_data(); if (1 == out->dims().size()) { auto eigen_out = pten::EigenScalar::From(*out); @@ -55,7 +55,7 @@ using complex128 = ::paddle::platform::complex; PT_REGISTER_CTX_KERNEL(dot, GPU, ALL_LAYOUT, - pten::Dot, + pten::DotKernel, float, double, int, diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu new file mode 100644 index 0000000000000..f20c3f82c9262 --- /dev/null +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/kernels/matmul_grad_kernel.h" + +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" + +PT_REGISTER_CTX_KERNEL(matmul_grad, + GPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_CTX_KERNEL(matmul_double_grad, + GPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_CTX_KERNEL(matmul_triple_grad, + GPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/hybird/transpose.h b/paddle/pten/kernels/hybird/transpose.h index 459fed6b9fa04..17f52c74a1344 100644 --- a/paddle/pten/kernels/hybird/transpose.h +++ b/paddle/pten/kernels/hybird/transpose.h @@ -17,6 +17,9 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + namespace pten { namespace math { @@ -30,5 +33,30 @@ struct TransposeNormal { const std::vector& axis); }; +template +struct Transpose { + void operator()(const DeviceContext& dev_ctx, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto eigen_in = pten::EigenTensor::From(in); + auto eigen_out = pten::EigenTensor::From(*out); + auto* dev = dev_ctx.eigen_device(); + // use 32bit index to speed up computation + bool use_32bit_index = eigen_out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); + if (use_32bit_index && is_gpu_place) { + To32BitIndex(eigen_out).device(*dev) = + To32BitIndex(eigen_in).shuffle(permute); + } else { + eigen_out.device(*dev) = eigen_in.shuffle(permute); + } + } +}; + } // namespace math } // namespace pten diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index 6f3a6049faa9a..e0c6825a78a53 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -21,12 +21,14 @@ namespace pten { template -void Conj(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { +void ConjKernel(const Context& context, + const DenseTensor& x, + DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); auto* out_data = out->mutable_data(); - paddle::platform::ForRange for_range(dev_ctx, numel); + paddle::platform::ForRange for_range(context, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h new file mode 100644 index 0000000000000..16c87bbab474a --- /dev/null +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -0,0 +1,919 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +#include "paddle/pten/kernels/complex_kernel.h" + +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/complex_functors.h" + +namespace pten { + +template +struct DotGradFunction { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* tensor_y, + const DenseTensor* tensor_dout, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy); +}; + +template +struct DotGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* tensor_y, + const DenseTensor* tensor_dout, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == tensor_dout->dims().size()) { + auto dout = EigenVector::Flatten(*tensor_dout); + + if (tensor_dx) { + auto y = EigenVector::Flatten(*tensor_y); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(tensor_dx->numel()); + + ConjKernel(ctx, *tensor_y, tensor_dx); + + auto dx = EigenVector::Flatten(*tensor_dx); + dx.device(dev) = dx * dout.broadcast(size); + } + + if (tensor_dy) { + auto x = EigenVector::Flatten(*tensor_x); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(tensor_dy->numel()); + + ConjKernel(ctx, *tensor_x, tensor_dy); + + auto dy = EigenVector::Flatten(*tensor_dy); + dy.device(dev) = dy * dout.broadcast(size); + } + } else { + auto dout = EigenMatrix::From(*tensor_dout); + + if (tensor_dx) { + tensor_dx->mutable_data(); + auto y = EigenMatrix::From(*tensor_y); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(1, tensor_dx->dims()[1]); + + ConjKernel(ctx, *tensor_y, tensor_dx); + + auto dx = EigenMatrix::From(*tensor_dx); + dx.device(dev) = dx * dout.broadcast(size); + } + + if (tensor_dy) { + tensor_dy->mutable_data(); + auto x = EigenMatrix::From(*tensor_x); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(1, tensor_dy->dims()[1]); + + ConjKernel(ctx, *tensor_x, tensor_dy); + + auto dy = EigenMatrix::From(*tensor_dy); + dy.device(dev) = dy * dout.broadcast(size); + } + } +#else + const auto* data_dout = tensor_dout->data(); + + if (tensor_dx) { + auto* data_dx = tensor_dx->mutable_data(); + const auto* data_y = tensor_y->data(); + const DDim& dim = tensor_x->dims(); + size_t N = static_cast(paddle::framework::product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dx[i] = T(data_y[i].real, -data_y[i].imag) * data_dout[s]; + } + } + + if (tensor_dy) { + auto* data_dy = tensor_dy->mutable_data(); + const auto* data_x = tensor_x->data(); + const DDim& dim = tensor_y->dims(); + size_t N = static_cast(paddle::framework::product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dy[i] = T(data_x[i].real, -data_x[i].imag) * data_dout[s]; + } + } +#endif + } +}; + +template +struct DotGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* 
tensor_y, + const DenseTensor* tensor_dout, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == tensor_dout->dims().size()) { + auto dout = EigenVector::Flatten(*tensor_dout); + if (tensor_dx) { + auto y = EigenVector::Flatten(*tensor_y); + auto dx = EigenVector::Flatten(*tensor_dx); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(tensor_dx->numel()); + dx.device(dev) = y * dout.broadcast(size); + } + + if (tensor_dy) { + auto x = EigenVector::Flatten(*tensor_x); + auto dy = EigenVector::Flatten(*tensor_dy); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(tensor_dy->numel()); + dy.device(dev) = x * dout.broadcast(size); + } + } else { + auto dout = EigenMatrix::From(*tensor_dout); + + if (tensor_dx) { + tensor_dx->mutable_data(); + auto y = EigenMatrix::From(*tensor_y); + auto dx = EigenMatrix::From(*tensor_dx); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(1, tensor_dx->dims()[1]); + dx.device(dev) = y * dout.broadcast(size); + } + + if (tensor_dy) { + tensor_dy->mutable_data(); + auto x = EigenMatrix::From(*tensor_x); + auto dy = EigenMatrix::From(*tensor_dy); + auto& dev = *ctx.eigen_device(); + Eigen::DSizes size(1, tensor_dy->dims()[1]); + dy.device(dev) = x * dout.broadcast(size); + } + } +#else + auto const *x = tensor_x->data(), *y = tensor_y->data(), + *dz = tensor_dout->data(); + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; + + if (tensor_dx) { + auto* dx = tensor_dx->mutable_data(); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; + } + } + + if (tensor_dy) { + auto* dy = tensor_dy->mutable_data(); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; + } + } +#endif + } +}; + +template +struct DotDoubleGradFunction { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* tensor_y, + const DenseTensor* tensor_dout, + const DenseTensor* tensor_ddx, + const DenseTensor* tensor_ddy, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy, + DenseTensor* tensor_ddout); +}; + +template +struct DotDoubleGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* tensor_y, + const DenseTensor* tensor_dout, + const DenseTensor* tensor_ddx, + const DenseTensor* tensor_ddy, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy, + DenseTensor* tensor_ddout) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == tensor_dout->dims().size()) { + DenseTensor tensor_dout_help; + auto& dev = *ctx.eigen_device(); + if (tensor_dx || tensor_dy) { + tensor_dout_help = Conj(ctx, *tensor_dout); + } + if (tensor_dx) { + auto ddy = EigenVector::Flatten(*tensor_ddy); + Eigen::DSizes size(tensor_ddy->numel()); + auto dx = EigenVector::Flatten(*tensor_dx); + auto dout = EigenVector::Flatten(tensor_dout_help); + dx.device(dev) = ddy * dout.broadcast(size); + } + + if (tensor_dy) { + auto ddx = EigenVector::Flatten(*tensor_ddx); + Eigen::DSizes size(tensor_ddx->numel()); + auto dy = EigenVector::Flatten(*tensor_dy); + auto dout = EigenVector::Flatten(tensor_dout_help); + dy.device(dev) = ddx * dout.broadcast(size); + } + + if (tensor_ddout) { + DenseTensor tensor_x_help = Conj(ctx, *tensor_x); + DenseTensor tensor_y_help = Conj(ctx, *tensor_y); + + auto x = EigenVector::Flatten(tensor_x_help); + auto y = EigenVector::Flatten(tensor_y_help); + auto ddx = 
EigenVector::Flatten(*tensor_ddx); + auto ddy = EigenVector::Flatten(*tensor_ddy); + auto ddout = EigenVector::Flatten(*tensor_ddout); + ddout.device(dev) = (x * ddy + y * ddx).sum(); + } + } +#else + const auto* data_dout = tensor_dout->data(); + + if (tensor_dx) { + auto* data_dx = tensor_dx->mutable_data(); + const auto* data_ddy = tensor_ddy->data(); + const DDim& dim = tensor_dx->dims(); + size_t N = static_cast(product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dx[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddy[i]; + } + } + + if (tensor_dy) { + auto* data_dy = tensor_dy->mutable_data(); + const auto* data_ddx = tensor_ddx->data(); + const DDim& dim = tensor_dy->dims(); + size_t N = static_cast(product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dy[i] = T(data_dout[s].real, -data_dout[s].imag) * data_ddx[i]; + } + } + + if (tensor_ddout) { + auto* data_ddout = tensor_ddout->mutable_data(); + auto* data_x = tensor_x->data(); + auto* data_y = tensor_y->data(); + auto* data_ddx = tensor_ddx->data(); + auto* data_ddy = tensor_ddy->data(); + + const DDim& dim = tensor_dy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_ddout[s] = T(data_x[i].real, -data_x[i].imag) * data_ddy[i] + + T(data_y[i].real, -data_y[i].imag) * data_ddx[i]; + } else { + data_ddout[s] += T(data_x[i].real, -data_x[i].imag) * data_ddy[i] + + T(data_y[i].real, -data_y[i].imag) * data_ddx[i]; + } + new_s = false; + } + } +#endif + } +}; + +template +struct DotDoubleGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* tensor_x, + const DenseTensor* tensor_y, + const DenseTensor* tensor_dout, + const DenseTensor* tensor_ddx, + const DenseTensor* tensor_ddy, + DenseTensor* tensor_dx, + DenseTensor* tensor_dy, + DenseTensor* tensor_ddout) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == tensor_dout->dims().size()) { + auto& dev = *ctx.eigen_device(); + auto dout = EigenVector::Flatten(*tensor_dout); + if (tensor_dx) { + tensor_dx->mutable_data(); + auto ddy = EigenVector::Flatten(*tensor_ddy); + Eigen::DSizes size(tensor_ddy->numel()); + auto dx = EigenVector::Flatten(*tensor_dx); + dx.device(dev) = ddy * dout.broadcast(size); + } + + if (tensor_dy) { + tensor_dy->mutable_data(); + auto ddx = EigenVector::Flatten(*tensor_ddx); + Eigen::DSizes size(tensor_ddx->numel()); + + auto dy = EigenVector::Flatten(*tensor_dy); + dy.device(dev) = ddx * dout.broadcast(size); + } + + if (tensor_ddout) { + tensor_ddout->mutable_data(); + auto x = EigenVector::Flatten(*tensor_x); + auto y = EigenVector::Flatten(*tensor_y); + auto ddx = EigenVector::Flatten(*tensor_ddx); + auto ddy = EigenVector::Flatten(*tensor_ddy); + auto ddout = EigenVector::Flatten(*tensor_ddout); + ddout.device(dev) = (x * ddy + y * ddx).sum(); + } + } +#else + const auto* data_dout = tensor_dout->data(); + + if (tensor_dx) { + auto* data_dx = tensor_dx->mutable_data(); + const auto* data_ddy = tensor_ddy->data(); + const DDim& dim = tensor_dx->dims(); + size_t N = static_cast(product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dx[i] = data_dout[s] * data_ddy[i]; + } + } + + if (tensor_dy) { + 
auto* data_dy = tensor_dy->mutable_data(); + const auto* data_ddx = tensor_ddx->data(); + const DDim& dim = tensor_dy->dims(); + size_t N = static_cast(product(dim)); + + auto step = dim[dim.size() - 1]; + + int s = -1; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_dy[i] = data_dout[s] * data_ddx[i]; + } + } + + if (tensor_ddout) { + auto* data_ddout = tensor_ddout->mutable_data(); + auto* data_x = tensor_x->data(); + auto* data_y = tensor_y->data(); + auto* data_ddx = tensor_ddx->data(); + auto* data_ddy = tensor_ddy->data(); + + const DDim& dim = tensor_dy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_ddout[s] = data_x[i] * data_ddy[i] + data_y[i] * data_ddx[i]; + } else { + data_ddout[s] += data_x[i] * data_ddy[i] + data_y[i] * data_ddx[i]; + } + new_s = false; + } + } +#endif + } +}; + +template +struct DotTripleGradFunction { + void operator()(const DeviceContext& ctx, + const DenseTensor* in_tensor_x, + const DenseTensor* in_tensor_y, + const DenseTensor* in_tensor_ddx, + const DenseTensor* in_tensor_ddy, + const DenseTensor* in_tensor_d_dx, + const DenseTensor* in_tensor_d_dy, + const DenseTensor* in_tensor_dout, + const DenseTensor* in_tensor_d_ddout, + DenseTensor* out_tensor_d_x, + DenseTensor* out_tensor_d_y, + DenseTensor* out_tensor_d_dout, + DenseTensor* out_tensor_d_ddx, + DenseTensor* out_tensor_d_ddy); +}; + +// TODO(wuweilong): enable this function when the unittests framewark for multi +// grad is ok (dtype: complex64 or complex128). +template +struct DotTripleGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* in_tensor_x, + const DenseTensor* in_tensor_y, + const DenseTensor* in_tensor_ddx, + const DenseTensor* in_tensor_ddy, + const DenseTensor* in_tensor_d_dx, + const DenseTensor* in_tensor_d_dy, + const DenseTensor* in_tensor_dout, + const DenseTensor* in_tensor_d_ddout, + DenseTensor* out_tensor_d_x, + DenseTensor* out_tensor_d_y, + DenseTensor* out_tensor_d_dout, + DenseTensor* out_tensor_d_ddx, + DenseTensor* out_tensor_d_ddy) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == in_tensor_d_ddout->dims().size()) { + DenseTensor in_tensor_d_ddout_help; + auto& dev = *ctx.eigen_device(); + if (out_tensor_d_x || out_tensor_d_y) { + in_tensor_d_ddout_help = + Conj(ctx, *in_tensor_d_ddout); + } + if (out_tensor_d_x) { + auto ddy = EigenVector::Flatten(*in_tensor_ddy); + Eigen::DSizes size(in_tensor_ddy->numel()); + auto d_x = EigenVector::Flatten(*out_tensor_d_x); + auto d_ddout = EigenVector::Flatten(in_tensor_d_ddout_help); + d_x.device(dev) = ddy * d_ddout.broadcast(size); + } + + if (out_tensor_d_y) { + auto ddx = EigenVector::Flatten(*in_tensor_ddx); + Eigen::DSizes size(in_tensor_ddx->numel()); + auto d_y = EigenVector::Flatten(*out_tensor_d_y); + auto d_ddout = EigenVector::Flatten(in_tensor_d_ddout_help); + d_y.device(dev) = ddx * d_ddout.broadcast(size); + } + + if (out_tensor_d_dout) { + DenseTensor in_tensor_ddx_help = + Conj(ctx, *in_tensor_ddx); + DenseTensor in_tensor_ddy_help = + Conj(ctx, *in_tensor_ddy); + + auto ddx = EigenVector::Flatten(in_tensor_ddx_help); + auto ddy = EigenVector::Flatten(in_tensor_ddy_help); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + d_dout.device(dev) = (ddx * d_dy + ddy * 
d_dx).sum(); + } + + if (out_tensor_d_ddx) { + DenseTensor in_tensor_dout_help = + Conj(ctx, *in_tensor_dout); + DenseTensor in_tensor_y_help = + Conj(ctx, *in_tensor_y); + + auto dout = EigenVector::Flatten(in_tensor_dout_help); + auto y = EigenVector::Flatten(in_tensor_y_help); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + Eigen::DSizes size(in_tensor_y->numel()); + d_ddx.device(dev) = + (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size)); + } + + if (out_tensor_d_ddy) { + DenseTensor in_tensor_dout_help = + Conj(ctx, *in_tensor_dout); + DenseTensor in_tensor_x_help = + Conj(ctx, *in_tensor_x); + + auto dout = EigenVector::Flatten(in_tensor_dout_help); + auto x = EigenVector::Flatten(in_tensor_x_help); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + Eigen::DSizes size(in_tensor_x->numel()); + d_ddy.device(dev) = + (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); + } + } +#else + const auto* data_d_ddout = in_tensor_d_ddout->data(); + + if (out_tensor_d_x) { + auto* data_d_x = out_tensor_d_x->mutable_data(); + const auto* data_ddy = in_tensor_ddy->data(); + + const DDim& dim = out_tensor_d_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_x[i] = T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s]; + } + } + + if (out_tensor_d_y) { + auto* data_d_y = out_tensor_d_y->mutable_data(); + const auto* data_ddx = in_tensor_ddx->data(); + + const DDim& dim = out_tensor_d_y->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_y[i] = T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s]; + } + } + + if (out_tensor_d_dout) { + auto* data_d_dout = out_tensor_d_dout->mutable_data(); + auto* data_ddx = in_tensor_ddx->data(); + auto* data_ddy = in_tensor_ddy->data(); + auto* data_d_dx = in_tensor_d_dx->data(); + auto* data_d_dy = in_tensor_d_dy->data(); + + const DDim& dim = out_tensor_d_dout->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + + T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + } else { + data_d_dout[s] += + T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] + + T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i]; + } + new_s = false; + } + } + + if (out_tensor_d_ddx) { + auto* data_d_ddx = out_tensor_d_ddx->mutable_data(); + auto* data_dout = in_tensor_dout->data(); + auto* data_d_dy = in_tensor_d_dy->data(); + auto* data_y = in_tensor_y->data(); + auto* data_d_ddout = in_tensor_d_ddout->data(); + + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = + T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i] + + T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s]; + } + } + + if (out_tensor_d_ddy) { + auto* data_d_ddy = out_tensor_d_ddy->mutable_data(); + auto* data_dout 
= in_tensor_dout->data(); + auto* data_d_dx = in_tensor_d_dx->data(); + auto* data_x = in_tensor_x->data(); + auto* data_d_ddout = in_tensor_d_ddout->data(); + + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = + T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i] + + T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s]; + } + } +#endif + } +}; + +template +struct DotTripleGradFunction> { + void operator()(const DeviceContext& ctx, + const DenseTensor* in_tensor_x, + const DenseTensor* in_tensor_y, + const DenseTensor* in_tensor_ddx, + const DenseTensor* in_tensor_ddy, + const DenseTensor* in_tensor_d_dx, + const DenseTensor* in_tensor_d_dy, + const DenseTensor* in_tensor_dout, + const DenseTensor* in_tensor_d_ddout, + DenseTensor* out_tensor_d_x, + DenseTensor* out_tensor_d_y, + DenseTensor* out_tensor_d_dout, + DenseTensor* out_tensor_d_ddx, + DenseTensor* out_tensor_d_ddy) { +#if defined(__NVCC__) || defined(__HIPCC__) + if (1 == in_tensor_d_ddout->dims().size()) { + auto& dev = *ctx.eigen_device(); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + if (out_tensor_d_x) { + out_tensor_d_x->mutable_data(); + auto ddy = EigenVector::Flatten(*in_tensor_ddy); + Eigen::DSizes size(in_tensor_ddy->numel()); + auto d_x = EigenVector::Flatten(*out_tensor_d_x); + d_x.device(dev) = ddy * d_ddout.broadcast(size); + } + + if (out_tensor_d_y) { + out_tensor_d_y->mutable_data(); + auto ddx = EigenVector::Flatten(*in_tensor_ddx); + Eigen::DSizes size(in_tensor_ddx->numel()); + + auto d_y = EigenVector::Flatten(*out_tensor_d_y); + d_y.device(dev) = ddx * d_ddout.broadcast(size); + } + + if (out_tensor_d_dout) { + out_tensor_d_dout->mutable_data(); + auto ddx = EigenVector::Flatten(*in_tensor_ddx); + auto ddy = EigenVector::Flatten(*in_tensor_ddy); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_dout = EigenVector::Flatten(*out_tensor_d_dout); + d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum(); + } + + if (out_tensor_d_ddx) { + out_tensor_d_ddx->mutable_data(); + auto dout = EigenVector::Flatten(*in_tensor_dout); + auto y = EigenVector::Flatten(*in_tensor_y); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + auto d_dy = EigenVector::Flatten(*in_tensor_d_dy); + auto d_ddx = EigenVector::Flatten(*out_tensor_d_ddx); + Eigen::DSizes size(in_tensor_y->numel()); + d_ddx.device(dev) = + (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size)); + } + + if (out_tensor_d_ddy) { + out_tensor_d_ddy->mutable_data(); + auto dout = EigenVector::Flatten(*in_tensor_dout); + auto x = EigenVector::Flatten(*in_tensor_x); + auto d_ddout = EigenVector::Flatten(*in_tensor_d_ddout); + auto d_dx = EigenVector::Flatten(*in_tensor_d_dx); + auto d_ddy = EigenVector::Flatten(*out_tensor_d_ddy); + Eigen::DSizes size(in_tensor_x->numel()); + d_ddy.device(dev) = + (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size)); + } + } +#else + const auto* data_d_ddout = in_tensor_d_ddout->data(); + + if (out_tensor_d_x) { + auto* data_d_x = out_tensor_d_x->mutable_data(); + const auto* data_ddy = in_tensor_ddy->data(); + + const DDim& dim = out_tensor_d_x->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_x[i] = data_ddy[i] * data_d_ddout[s]; + } + } + + if 
(out_tensor_d_y) { + auto* data_d_y = out_tensor_d_y->mutable_data(); + const auto* data_ddx = in_tensor_ddx->data(); + + const DDim& dim = out_tensor_d_y->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_y[i] = data_ddx[i] * data_d_ddout[s]; + } + } + + if (out_tensor_d_dout) { + auto* data_d_dout = out_tensor_d_dout->mutable_data(); + auto* data_ddx = in_tensor_ddx->data(); + auto* data_ddy = in_tensor_ddy->data(); + auto* data_d_dx = in_tensor_d_dx->data(); + auto* data_d_dy = in_tensor_d_dy->data(); + + const DDim& dim = in_tensor_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + bool new_s = false; + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) { + ++s; + new_s = true; + } + if (new_s) { + data_d_dout[s] = + data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; + } else { + data_d_dout[s] += + data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i]; + } + new_s = false; + } + } + + if (out_tensor_d_ddx) { + auto* data_d_ddx = out_tensor_d_ddx->mutable_data(); + auto* data_dout = in_tensor_dout->data(); + auto* data_d_dy = in_tensor_d_dy->data(); + auto* data_y = in_tensor_y->data(); + auto* data_d_ddout = in_tensor_d_ddout->data(); + + const DDim& dim = out_tensor_d_ddx->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddx[i] = + data_dout[s] * data_d_dy[i] + data_y[i] * data_d_ddout[s]; + } + } + + if (out_tensor_d_ddy) { + auto* data_d_ddy = out_tensor_d_ddy->mutable_data(); + auto* data_dout = in_tensor_dout->data(); + auto* data_d_dx = in_tensor_d_dx->data(); + auto* data_x = in_tensor_x->data(); + auto* data_d_ddout = in_tensor_d_ddout->data(); + + const DDim& dim = out_tensor_d_ddy->dims(); + size_t N = static_cast(product(dim)); + auto step = dim[dim.size() - 1]; + int s = -1; + + for (size_t i = 0; i < N; ++i) { + if (0 == i % step) ++s; + data_d_ddy[i] = + data_dout[s] * data_d_dx[i] + data_x[i] * data_d_ddout[s]; + } + } +#endif + } +}; + +template +void DotGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { + if (dx) { + dx->mutable_data(); + } + if (dy) { + dy->mutable_data(); + } + DotGradFunction()(dev_ctx, &x, &y, &dout, dx, dy); +} + +template +void DotDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout) { + if (dx) { + dx->mutable_data(); + } + if (dy) { + dy->mutable_data(); + } + if (ddout) { + ddout->mutable_data(); + } + DotDoubleGradFunction()( + dev_ctx, &x, &y, &dout, ddx, ddy, dx, dy, ddout); +} + +template +void DotTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + const DenseTensor& dout, + const DenseTensor& d_ddout, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_ddx, + DenseTensor* d_ddy, + DenseTensor* d_dout) { + if (d_x) { + d_x->mutable_data(); + } + if (d_y) { + d_y->mutable_data(); + } + if (d_ddx) { + d_ddx->mutable_data(); + } + if (d_ddy) { + d_ddy->mutable_data(); + } + if (d_dout) { + d_dout->mutable_data(); + } + + 
DotTripleGradFunction()(dev_ctx, + &x, + &y, + ddx, + ddy, + d_dx, + d_dy, + dout, + d_ddout, + d_x, + d_y, + d_dout, + d_ddx, + d_ddy); +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h new file mode 100644 index 0000000000000..802cc019d78c5 --- /dev/null +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -0,0 +1,1742 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// #include "paddle/pten/kernels/complex_kernel.h" +#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h" +#include "paddle/pten/kernels/impl/matmul_kernel_impl.h" + +#include "paddle/pten/kernels/cpu/reduce.h" +#include "paddle/pten/kernels/funcs/reduce_functor.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/pten/kernels/gpu/reduce.h" +#endif + +namespace pten { + +template +struct ReduceSumForMatmulGrad { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims); +}; + +template +struct ReduceSumForMatmulGrad { + void operator()(const CPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims) { + std::vector reduce_dims_tmp(reduce_dims.begin(), + reduce_dims.end()); + ReduceKernelImpl( + dev_ctx, input, output, reduce_dims_tmp, true, false); + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct ReduceSumForMatmulGrad { + void operator()(const GPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims) { + auto stream = dev_ctx.stream(); + kernels:: + TensorReduceFunctorImpl>( + input, output, kps::IdentityFunctor(), reduce_dims, stream); + } +}; +#endif + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +static DenseTensor FoldInitDims(const DenseTensor& input) { + DenseTensor output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. 
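+// Together with FoldInitDims above, this lets CalcInputGrad (below) collapse
+// a batched (rank-3) operand into a single 2-D GEMM when the gradient being
+// computed is itself 2-D, instead of falling back to a batched multiply.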
+template +static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx, + const DenseTensor& input) { + auto in_dims = input.dims(); + if (in_dims.size() != 3) { + return input; + } + DenseTensor output = EmptyLike(dev_ctx, input); + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + std::vector axis = {1, 0, 2}; + math::Transpose trans; + trans(dev_ctx, input, &output, axis); + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + return output; +} + +template +void MatMul(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + const DenseTensor& b, + bool trans_b, + DenseTensor* out, + bool flag = false) { + out->mutable_data(); + auto blas = paddle::operators::math::GetBlas(dev_ctx); + auto mat_dim_a = + paddle::operators::math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = + paddle::operators::math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + if (a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a.data(), + mat_dim_a, + b.data(), + mat_dim_b, + static_cast(1), + out->mutable_data(), + static_cast(flag)); +} + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static DDim RowMatrixFromVector(const DDim& x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return paddle::framework::make_ddim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the + * original y_dim is returned. + */ +static DDim ColumnMatrixFromVector(const DDim& y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return paddle::framework::make_ddim({y_dim[0], 1}); +} + +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. 
+ */ +static void ReshapeTensorIntoMatrixSequence( + DenseTensor* x, const paddle::operators::math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +static void ReshapeXYOutIntoMatrixSequence(DenseTensor* x, + DenseTensor* y, + DenseTensor* out, + bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixFromVector(x->dims()); + auto y_dim = ColumnMatrixFromVector(y->dims()); + auto mat_dim_x = + paddle::operators::math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = + paddle::operators::math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({(std::max)(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, + mat_dim_y.width_}); + } + + ReshapeTensorIntoMatrixSequence(x, mat_dim_x); + ReshapeTensorIntoMatrixSequence(y, mat_dim_y); +} + +template +void CalcInputGrad(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + bool is_fold_init_dims_a, + const DenseTensor& b, + bool trans_b, + bool is_fold_init_dims_b, + DenseTensor* out, + bool flag = false) { + if (out == nullptr) return; + bool need_combine = + (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; + if (!need_combine) { + MatMul(dev_ctx, a, trans_a, b, trans_b, out, flag); + } else { + MatMul( + dev_ctx, + is_fold_init_dims_a ? FoldInitDims(a) + : FoldHeadAndLastDims(dev_ctx, a), + trans_a, + is_fold_init_dims_b ? FoldInitDims(b) + : FoldHeadAndLastDims(dev_ctx, b), + trans_b, + out, + flag); + } +} + +template +void MatmulGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + bool transpose_x, + bool transpose_y, + DenseTensor* dx, + DenseTensor* dy) { + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(out_grad.dims()); + + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + // Case1 : x's or y's dim = 1 + if (x_ndim == 1 && y_ndim == 1) { + if (dx) dx->mutable_data(); + if (dy) dy->mutable_data(); + if (out_grad.numel() == 1) { + DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); + return; + } + } + + bool is_broadcast = true; + if (x_ndim <= 2 || y_ndim <= 2) { + is_broadcast = false; + } else if (x_ndim != y_ndim) { + is_broadcast = true; + } else { + is_broadcast = !std::equal( + x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); + } + + // for complex + DenseTensor x_conj; + DenseTensor y_conj; + + // Case2: no broadcast or no batch size, it aims to speed and it is same as + // matmul in old version. 
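+  // With Out = X * Y, the standard gradients are dX = dOut * Y^T and
+  // dY = X^T * dOut; the transpose_x / transpose_y branches below apply the
+  // same identities with the operands and transpose flags adjusted, and for
+  // complex types the conjugated copies x_conj / y_conj stand in for X and Y.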
+ if (!is_broadcast) { + DenseTensor x_help = x; + DenseTensor y_help = y; + DenseTensor out_grad_help = out_grad; + ReshapeXYOutIntoMatrixSequence( + &x_help, &y_help, &out_grad_help, transpose_x, transpose_y); + + DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x_help.dims()) { + dx->Resize(x_help.dims()); + } + + y_conj = Conj(dev_ctx, y_help); + } + + DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y_help.dims()) { + dy->Resize(y_help.dims()); + } + + x_conj = Conj(dev_ctx, x_help); + } + + if (transpose_x && transpose_y) { + CalcInputGrad( + dev_ctx, y_conj, true, true, out_grad_help, true, false, dx); + CalcInputGrad( + dev_ctx, out_grad_help, true, true, x_conj, true, false, dy); + } else if (transpose_x) { + CalcInputGrad( + dev_ctx, y_conj, false, false, out_grad_help, true, false, dx); + CalcInputGrad( + dev_ctx, x_conj, false, false, out_grad_help, false, true, dy); + } else if (transpose_y) { + CalcInputGrad( + dev_ctx, out_grad_help, false, false, y_conj, false, true, dx); + CalcInputGrad( + dev_ctx, out_grad_help, true, true, x_conj, false, true, dy); + } else { + CalcInputGrad( + dev_ctx, out_grad_help, false, false, y_conj, true, false, dx); + CalcInputGrad( + dev_ctx, x_conj, true, true, out_grad_help, false, true, dy); + } + + if (dx) { + if (dx_dims != x_help.dims()) { + dx->Resize(dx_dims); + } + } + if (dy) { + if (dy_dims != y_help.dims()) { + dy->Resize(dy_dims); + } + } + } else { + // Case3: broadcast. It need cost much time to reduce sum for the + // broadcast and wastes the memory. + // So we should avoid the case in reality. + VLOG(3) << "It need cost much time to reduce sum for the broadcast and " + "wastes the memory. So we should avoid the case in reality"; + x_conj = Conj(dev_ctx, x); + y_conj = Conj(dev_ctx, y); + + DenseTensor dx_help = Empty(dev_ctx); + DenseTensor dy_help = Empty(dev_ctx); + + if (transpose_x) { + if (transpose_y) { + // X'Y': dA = Y'G', dB = G'X' + if (dx) + MatMulFunction(dev_ctx, + y_conj, + out_grad, + y_dims, + dout_dims, + &dx_help, + true, + true); + if (dy) + MatMulFunction(dev_ctx, + out_grad, + x_conj, + dout_dims, + x_dims, + &dy_help, + true, + true); + } else { + // X'Y: dX = YG', dY = XG + if (dx) + MatMulFunction(dev_ctx, + y_conj, + out_grad, + y_dims, + dout_dims, + &dx_help, + false, + true); + if (dy) + MatMulFunction(dev_ctx, + x_conj, + out_grad, + x_dims, + dout_dims, + &dy_help, + false, + false); + } + } else { + if (transpose_y) { + // XY': dX = GY, dY = G'X + if (dx) + MatMulFunction(dev_ctx, + out_grad, + y_conj, + dout_dims, + y_dims, + &dx_help, + false, + false); + if (dy) + MatMulFunction(dev_ctx, + out_grad, + x_conj, + dout_dims, + x_dims, + &dy_help, + true, + false); + } else { + // XY: dX = GY', dY = X'G + if (dx) + MatMulFunction(dev_ctx, + out_grad, + y_conj, + dout_dims, + y_dims, + &dx_help, + false, + true); + if (dy) + MatMulFunction(dev_ctx, + x_conj, + out_grad, + x_dims, + dout_dims, + &dy_help, + true, + false); + } + } + + // get help dims + const std::vector dx_help_dims = vectorize(dx_help.dims()); + const std::vector dy_help_dims = vectorize(dy_help.dims()); + + std::vector dx_broadcast_dims(ndim); + std::vector dy_broadcast_dims(ndim); + + std::fill( + dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::fill( + dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(x_dims.data(), + x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + std::copy(y_dims.data(), + y_dims.data() + 
y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dx_reduce_dims; + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dx) { + if (dx_reduce_dims.empty()) { + *dx = std::move(dx_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, dx_help, dx, dx_reduce_dims); + } + dx->Resize(x.dims()); + } + if (dy) { + if (dy_reduce_dims.empty()) { + *dy = std::move(dy_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, dy_help, dy, dy_reduce_dims); + } + dy->Resize(y.dims()); + } + // Get the OutputGrad(out) + } +} + +template +void MatmulDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + bool transpose_x, + bool transpose_y, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout) { + // Get dims from the input x, y, output_grad + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(dout.dims()); + + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + // Case1 : x's or y's dim = 1 + if (x_ndim == 1 && y_ndim == 1) { + DotDoubleGradFunction()( + dev_ctx, &x, &y, &dout, ddx.get_ptr(), ddy.get_ptr(), dx, dy, ddout); + return; + } + + DenseTensor x_conj; + DenseTensor y_conj; + DenseTensor dout_conj; + + bool is_broadcast = true; + if (x_ndim <= 2 || y_ndim <= 2) { + is_broadcast = false; + } else if (x_ndim != y_ndim) { + is_broadcast = true; + } else { + is_broadcast = !std::equal( + x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); + } + + if (!is_broadcast) { + // Case2: no broadcast or no batch size + DenseTensor x_help = x; + DenseTensor y_help = y; + DenseTensor dout_help = dout; + ReshapeXYOutIntoMatrixSequence( + &x_help, &y_help, &dout_help, transpose_x, transpose_y); + DDim dx_dims; + + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x_help.dims()) { + dx->Resize(x_help.dims()); + } + } + + DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y_help.dims()) { + dy->Resize(y_help.dims()); + } + } + + DDim ddout_dims; + if (ddout) { + ddout_dims = ddout->dims(); + if (ddout_dims != dout_help.dims()) { + ddout->Resize(dout_help.dims()); + } + + x_conj = Conj(dev_ctx, x_help); + y_conj = Conj(dev_ctx, y_help); + } + + if (dx || dy) { + dout_conj = Conj(dev_ctx, dout_help); + } + + bool ddout_flag = false; + if (ddx) { + auto ddx_mat = ddx.get(); + if (ddx_mat.dims() != x_help.dims()) { + ddx_mat.Resize(x_help.dims()); + } + if (dy) { + if (transpose_x && transpose_y) { + // dy = dout' * ddx' + CalcInputGrad( + dev_ctx, dout_conj, true, true, ddx_mat, true, false, dy, false); + } else if (transpose_x) { + // dy = ddx * dout + CalcInputGrad(dev_ctx, + ddx_mat, + false, + false, + dout_conj, + false, + true, + dy, + false); + } else if (transpose_y) { + // dy = dout' * ddx + CalcInputGrad( + dev_ctx, dout_conj, true, true, ddx_mat, false, true, dy, false); + } else { + // dy = ddx' * dout + CalcInputGrad( + dev_ctx, ddx_mat, true, true, dout_conj, false, true, dy, false); + } + } + + if (ddout) { + CalcInputGrad(dev_ctx, + ddx_mat, + transpose_x, + true, + y_conj, + transpose_y, + false, + ddout, + ddout_flag); + ddout_flag = true; + } + } + + if 
(ddy) { + auto ddy_mat = ddy.get(); + if (ddy_mat.dims() != y_help.dims()) { + ddy_mat.Resize(y_help.dims()); + } + if (dx) { + if (transpose_x && transpose_y) { + // dx = ddy' * dout' + CalcInputGrad( + dev_ctx, ddy_mat, true, true, dout_conj, true, false, dx, false); + } else if (transpose_x) { + // dx = ddy * dout' + CalcInputGrad(dev_ctx, + ddy_mat, + false, + false, + dout_conj, + true, + false, + dx, + false); + } else if (transpose_y) { + // dx = dout * ddy + CalcInputGrad(dev_ctx, + dout_conj, + false, + false, + ddy_mat, + false, + true, + dx, + false); + } else { + // dx = dout * ddy' + CalcInputGrad(dev_ctx, + dout_conj, + false, + false, + ddy_mat, + true, + false, + dx, + false); + } + } + + if (ddout) { + CalcInputGrad(dev_ctx, + x_conj, + transpose_x, + true, + ddy_mat, + transpose_y, + false, + ddout, + ddout_flag); + } + } + + if (dx) { + if (dx_dims != x_help.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y_help.dims()) { + dy->Resize(dy_dims); + } + } + + if (ddout) { + if (ddout_dims != dout_help.dims()) { + ddout->Resize(ddout_dims); + } + } + } else { + // Case3: broadcast. It need cost much time to reduce sum for the + // broadcast and wastes the memory. + // So we should avoid the case in reality. + VLOG(3) << "It need cost much time to reduce sum for the broadcast and " + "wastes the memory. So we should avoid the case in reality"; + if (dx || dy) { + dout_conj = Conj(dev_ctx, dout); + } + if (ddout) { + x_conj = Conj(dev_ctx, x); + y_conj = Conj(dev_ctx, y); + } + + DenseTensor dx_help = Empty(dev_ctx); + DenseTensor dy_help = Empty(dev_ctx); + + if (transpose_x) { + if (transpose_y) { + if (dx) { + MatMulFunction(dev_ctx, + ddy.get(), + dout_conj, + y_dims, + dout_dims, + &dx_help, + true, + true); + } + if (dy) { + MatMulFunction(dev_ctx, + dout_conj, + ddx.get(), + dout_dims, + x_dims, + &dy_help, + true, + true); + } + } else { + if (dx) + MatMulFunction(dev_ctx, + ddy.get(), + dout_conj, + y_dims, + dout_dims, + &dx_help, + false, + true); + if (dy) + MatMulFunction(dev_ctx, + ddx.get(), + dout_conj, + x_dims, + dout_dims, + &dy_help, + false, + false); + } + } else { + if (transpose_y) { + if (dx) { + MatMulFunction(dev_ctx, + dout_conj, + ddy.get(), + dout_dims, + y_dims, + &dx_help, + false, + false); + } + if (dy) { + MatMulFunction(dev_ctx, + dout_conj, + ddx.get(), + dout_dims, + x_dims, + &dy_help, + true, + false); + } + } else { + if (dx) { + MatMulFunction(dev_ctx, + dout_conj, + ddy.get(), + dout_dims, + y_dims, + &dx_help, + false, + true); + } + if (dy) { + MatMulFunction(dev_ctx, + ddx.get(), + dout_conj, + x_dims, + dout_dims, + &dy_help, + true, + false); + } + } + } + + // get help dims + const std::vector dx_help_dims = vectorize(dx_help.dims()); + const std::vector dy_help_dims = vectorize(dy_help.dims()); + + std::vector dx_broadcast_dims(ndim); + std::vector dy_broadcast_dims(ndim); + + std::fill( + dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::fill( + dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(x_dims.data(), + x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + std::copy(y_dims.data(), + y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dx_reduce_dims; + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { 
+ dy_reduce_dims.push_back(idx); + } + } + // Reduce sum to get grad by ReduceSum + if (dx) { + if (dx_reduce_dims.empty()) { + *dx = std::move(dx_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, dx_help, dx, dx_reduce_dims); + } + dx->Resize(x.dims()); + } + if (dy) { + if (dy_reduce_dims.empty()) { + *dy = std::move(dy_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, dy_help, dy, dy_reduce_dims); + } + dy->Resize(y.dims()); + } + + if (ddout) { + // Calculate the gradient of OutputGrad(Out) + MatMulFunction(dev_ctx, + ddx.get(), + y_conj, + x_dims, + y_dims, + ddout, + transpose_x, + transpose_y); + MatMulFunction(dev_ctx, + x_conj, + ddy.get(), + x_dims, + y_dims, + ddout, + transpose_x, + transpose_y, + true); + } + } +} + +template +void MatmulTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddy, + paddle::optional d_dx, + paddle::optional d_dy, + paddle::optional d_ddout, + bool transpose_x, + bool transpose_y, + DenseTensor* out_d_x, + DenseTensor* out_d_y, + DenseTensor* out_d_dout, + DenseTensor* out_d_ddx, + DenseTensor* out_d_ddy) { + // Get dims from the input x, y, output_grad + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(dout.dims()); + + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + // Case1 : x's and y's dim = 1 + if (x_ndim == 1 && y_ndim == 1) { + VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 1"; + DotTripleGradFunction()(dev_ctx, + &x, + &y, + &ddx, + &ddy, + d_dx.get_ptr(), + d_dy.get_ptr(), + &dout, + d_ddout.get_ptr(), + out_d_x, + out_d_y, + out_d_dout, + out_d_ddx, + out_d_ddy); + return; + } + + DenseTensor x_conj; + DenseTensor y_conj; + DenseTensor dout_conj; + DenseTensor ddx_conj; + DenseTensor ddy_conj; + + bool is_broadcast = true; + if (x_ndim <= 2 || y_ndim <= 2) { + is_broadcast = false; + } else if (x_ndim != y_ndim) { + is_broadcast = true; + } else { + is_broadcast = !std::equal( + x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); + } + + if (!is_broadcast) { + // Case2: no broadcast or no batch size + VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 2"; + DenseTensor x_help = x; + DenseTensor y_help = y; + DenseTensor dout_help = dout; + DenseTensor ddx_help = ddx; + DenseTensor ddy_help = ddy; + ReshapeXYOutIntoMatrixSequence( + &x_help, &y_help, &dout_help, transpose_x, transpose_y); + + if (ddx_help.dims() != x_help.dims()) { + ddx_help.Resize(x_help.dims()); + } + + if (ddy_help.dims() != y_help.dims()) { + ddy_help.Resize(y_help.dims()); + } + + DDim out_dx_dims; + if (out_d_x) { + out_dx_dims = out_d_x->dims(); + if (out_dx_dims != x_help.dims()) { + out_d_x->Resize(x_help.dims()); + } + } + + DDim out_dy_dims; + if (out_d_y) { + out_dy_dims = out_d_y->dims(); + if (out_dy_dims != y_help.dims()) { + out_d_y->Resize(y_help.dims()); + } + } + + DDim out_d_dout_dims; + if (out_d_dout) { + out_d_dout_dims = out_d_dout->dims(); + if (out_d_dout_dims != dout_help.dims()) { + out_d_dout->Resize(dout_help.dims()); + } + + ddx_conj = Conj(dev_ctx, ddx_help); + ddy_conj = Conj(dev_ctx, ddy_help); + } + + DDim out_d_ddx_dims; + if (out_d_ddx) { + out_d_ddx_dims = out_d_ddx->dims(); + if (out_d_ddx_dims != x_help.dims()) { + out_d_ddx->Resize(x_help.dims()); + } + } + + DDim out_d_ddy_dims; + if (out_d_ddy) { + out_d_ddy_dims = out_d_ddy->dims(); + if 
(out_d_ddy_dims != y_help.dims()) { + out_d_ddy->Resize(y_help.dims()); + } + } + + if (out_d_ddx || out_d_ddy) { + x_conj = Conj(dev_ctx, x_help); + y_conj = Conj(dev_ctx, y_help); + dout_conj = Conj(dev_ctx, dout_help); + } + + bool d_dout_flag = false; + bool d_ddx_flag = false; + bool d_ddy_flag = false; + + if (d_ddout) { + auto d_ddout_mat = d_ddout.get(); + if (d_ddout_mat.dims() != dout_help.dims()) { + d_ddout_mat.Resize(dout_help.dims()); + } + + if (out_d_y) { + if (transpose_x && transpose_y) { + // out_d_y = d_ddout' * ddx' + CalcInputGrad(dev_ctx, + d_ddout_mat, + true, + true, + ddx_conj, + true, + false, + out_d_y, + false); + } else if (transpose_x) { + // out_d_y = ddx * d_ddout + CalcInputGrad(dev_ctx, + ddx_conj, + false, + false, + d_ddout_mat, + false, + true, + out_d_y, + false); + } else if (transpose_y) { + // out_d_y = d_ddout' * ddx + CalcInputGrad(dev_ctx, + d_ddout_mat, + true, + true, + ddx_conj, + false, + true, + out_d_y, + false); + } else { + // out_d_y = ddx' * d_ddout + CalcInputGrad(dev_ctx, + ddx_conj, + true, + true, + d_ddout_mat, + false, + true, + out_d_y, + false); + } + } + if (out_d_x) { + if (transpose_x && transpose_y) { + // out_d_x = ddy' * d_ddout' + CalcInputGrad(dev_ctx, + ddy_conj, + true, + true, + d_ddout_mat, + true, + false, + out_d_x, + false); + } else if (transpose_x) { + // out_d_x = ddy * d_ddout' + CalcInputGrad(dev_ctx, + ddy_conj, + false, + false, + d_ddout_mat, + true, + false, + out_d_x, + false); + } else if (transpose_y) { + // out_d_x = d_ddout * ddy + CalcInputGrad(dev_ctx, + d_ddout_mat, + false, + false, + ddy_conj, + false, + true, + out_d_x, + false); + } else { + // out_d_x = d_ddout * ddy' + CalcInputGrad(dev_ctx, + d_ddout_mat, + false, + false, + ddy_conj, + true, + false, + out_d_x, + false); + } + } + + // equations: + // d_ddx = DOut * D_DY + Y * D_DDOut + // Let: d_ddx1 = Y * D_DDOut + // Let: d_ddx2 = DOut * D_DY + + // d_ddy = DOut * D_DX + X * D_DDOut + // Let: d_ddy1 = X * D_DDOut + // Let: d_ddy2 = DOut * D_DX + + // d_dout = DDY * D_DX + DDX * D_DY + // Let: d_dout1 = DDX * D_DY + // Let: d_dout2 = DDY * D_DX + + // compute d_ddx1 + if (out_d_ddx) { + if (transpose_x && transpose_y) { + // out_d_ddx1 = y' * d_ddout' + CalcInputGrad(dev_ctx, + y_conj, + true, + true, + d_ddout_mat, + true, + false, + out_d_ddx, + d_ddx_flag); + } else if (transpose_x) { + // out_d_ddx1 = y * d_ddout' + CalcInputGrad(dev_ctx, + y_conj, + false, + false, + d_ddout_mat, + true, + false, + out_d_ddx, + d_ddx_flag); + } else if (transpose_y) { + // out_d_ddx1 = d_ddout * y + CalcInputGrad(dev_ctx, + d_ddout_mat, + false, + false, + y_conj, + false, + true, + out_d_ddx, + d_ddx_flag); + } else { + // out_d_ddx1 = d_ddout * y' + CalcInputGrad(dev_ctx, + d_ddout_mat, + false, + false, + y_conj, + true, + false, + out_d_ddx, + d_ddx_flag); + } + d_ddx_flag = true; + } + + // compute d_ddy1 + if (out_d_ddy) { + if (transpose_x && transpose_y) { + // out_d_ddy1 = d_ddout' * x' + CalcInputGrad(dev_ctx, + d_ddout_mat, + true, + true, + x_conj, + true, + false, + out_d_ddy, + false); + } else if (transpose_x) { + // out_d_ddy1 = x * d_ddout + CalcInputGrad(dev_ctx, + x_conj, + false, + false, + d_ddout_mat, + false, + true, + out_d_ddy, + false); + } else if (transpose_y) { + // out_d_ddy1 = d_ddout' * x + CalcInputGrad(dev_ctx, + d_ddout_mat, + true, + true, + x_conj, + false, + true, + out_d_ddy, + false); + } else { + // out_d_ddy1 = x' * d_ddout + CalcInputGrad(dev_ctx, + x_conj, + true, + true, + d_ddout_mat, + false, + true, + 
out_d_ddy, + false); + } + d_ddy_flag = true; + } + } + + if (d_dy) { + auto d_dy_mat = d_dy.get(); + if (d_dy_mat.dims() != y_help.dims()) { + d_dy_mat.Resize(y_help.dims()); + } + + // compute d_dout1 + if (out_d_dout) { + CalcInputGrad(dev_ctx, + ddx_conj, + transpose_x, + true, + d_dy_mat, + transpose_y, + false, + out_d_dout, + d_dout_flag); + d_dout_flag = true; + } + + // compute d_ddx2 + if (out_d_ddx) { + if (transpose_x && transpose_y) { + // out_d_ddx2 = D_DY' * DOut' + CalcInputGrad(dev_ctx, + d_dy_mat, + true, + true, + dout_conj, + true, + false, + out_d_ddx, + d_ddx_flag); + } else if (transpose_x) { + // out_d_ddx2 = D_DY * Dout' + CalcInputGrad(dev_ctx, + d_dy_mat, + false, + false, + dout_conj, + true, + false, + out_d_ddx, + d_ddx_flag); + } else if (transpose_y) { + // out_d_ddx2 = Dout * D_DY + CalcInputGrad(dev_ctx, + dout_conj, + false, + false, + d_dy_mat, + false, + true, + out_d_ddx, + d_ddx_flag); + } else { + // out_d_ddx2 = Dout * D_DY' + CalcInputGrad(dev_ctx, + dout_conj, + false, + false, + d_dy_mat, + true, + false, + out_d_ddx, + d_ddx_flag); + } + } + } + + if (d_dx) { + auto d_dx_mat = d_dx.get(); + if (d_dx_mat.dims() != x_help.dims()) { + d_dx_mat.Resize(x_help.dims()); + } + + // compute d_dout2 + if (out_d_dout) { + CalcInputGrad(dev_ctx, + d_dx_mat, + transpose_x, + true, + ddy_conj, + transpose_y, + false, + out_d_dout, + d_dout_flag); + } + + // compute d_ddy2 + if (out_d_ddy) { + if (transpose_x && transpose_y) { + // out_d_ddy2 = dout' * d_dx' + CalcInputGrad(dev_ctx, + dout_conj, + true, + true, + d_dx_mat, + true, + false, + out_d_ddy, + d_ddy_flag); + } else if (transpose_x) { + // out_d_ddy2 = d_dx * dout + CalcInputGrad(dev_ctx, + d_dx_mat, + false, + false, + dout_conj, + false, + true, + out_d_ddy, + d_ddy_flag); + } else if (transpose_y) { + // out_d_ddy2 = dout' * d_dx + CalcInputGrad(dev_ctx, + dout_conj, + true, + true, + d_dx_mat, + false, + true, + out_d_ddy, + d_ddy_flag); + } else { + // out_d_ddy2 = d_dx' * dout + CalcInputGrad(dev_ctx, + d_dx_mat, + true, + true, + dout_conj, + false, + true, + out_d_ddy, + d_ddy_flag); + } + } + } + + if (out_d_x) { + if (out_dx_dims != x_help.dims()) { + out_d_x->Resize(out_dx_dims); + } + } + + if (out_d_y) { + if (out_dy_dims != y_help.dims()) { + out_d_y->Resize(out_dy_dims); + } + } + + if (out_d_dout) { + if (out_d_dout_dims != dout_help.dims()) { + out_d_dout->Resize(out_d_dout_dims); + } + } + + if (out_d_ddx) { + if (out_d_ddx_dims != x_help.dims()) { + out_d_ddx->Resize(out_d_ddx_dims); + } + } + + if (out_d_ddy) { + if (out_d_ddy_dims != y_help.dims()) { + out_d_ddy->Resize(out_d_ddy_dims); + } + } + } else { + // Case3: broadcast. It need cost much time to reduce sum for the + // broadcast and wastes the memory. + // So we should avoid the case in reality. + VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 3"; + VLOG(3) << "It need cost much time to reduce sum for the broadcast and " + "wastes the memory. 
So we should avoid the case in reality"; + + DenseTensor out_dx_help = Empty(dev_ctx); + DenseTensor out_dy_help = Empty(dev_ctx); + DenseTensor out_d_ddx_help = Empty(dev_ctx); + DenseTensor out_d_ddy_help = Empty(dev_ctx); + + if (out_d_dout) { + ddx_conj = Conj(dev_ctx, ddx); + ddy_conj = Conj(dev_ctx, ddy); + } + if (out_d_ddx || out_d_ddy) { + x_conj = Conj(dev_ctx, x); + y_conj = Conj(dev_ctx, y); + dout_conj = Conj(dev_ctx, dout); + } + + if (transpose_x) { + if (transpose_y) { + // dX = ddY' d_ddout’, dY = d_ddout’ ddX' + if (out_d_x) + MatMulFunction(dev_ctx, + ddy_conj, + d_ddout.get(), + y_dims, + dout_dims, + &out_dx_help, + true, + true); + if (out_d_y) + MatMulFunction(dev_ctx, + d_ddout.get(), + ddx_conj, + dout_dims, + x_dims, + &out_dy_help, + true, + true); + } else { + // dX = ddY d_ddout', dY = ddX d_ddout + if (out_d_x) + MatMulFunction(dev_ctx, + ddy_conj, + d_ddout.get(), + y_dims, + dout_dims, + &out_dx_help, + false, + true); + if (out_d_y) + MatMulFunction(dev_ctx, + ddx_conj, + d_ddout.get(), + x_dims, + dout_dims, + &out_dy_help, + false, + false); + } + } else { + if (transpose_y) { + // dX = d_ddout ddY, dY = d_ddout’ ddX + if (out_d_x) + MatMulFunction(dev_ctx, + d_ddout.get(), + ddy_conj, + dout_dims, + y_dims, + &out_dx_help, + false, + false); + if (out_d_y) + MatMulFunction(dev_ctx, + d_ddout.get(), + ddx_conj, + dout_dims, + x_dims, + &out_dy_help, + true, + false); + } else { + // dX = d_ddout ddY', dY = ddX' d_ddout + if (out_d_x) + MatMulFunction(dev_ctx, + d_ddout.get(), + ddy_conj, + dout_dims, + y_dims, + &out_dx_help, + false, + true); + if (out_d_y) + MatMulFunction(dev_ctx, + ddx_conj, + d_ddout.get(), + x_dims, + dout_dims, + &out_dy_help, + true, + false); + } + } + + // get help dims + const std::vector dx_help_dims = + vectorize(out_dx_help.dims()); + const std::vector dy_help_dims = + vectorize(out_dx_help.dims()); + + std::vector dx_broadcast_dims(ndim); + std::vector dy_broadcast_dims(ndim); + + std::fill( + dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::fill( + dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(x_dims.data(), + x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + std::copy(y_dims.data(), + y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dx_reduce_dims; + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // Reduce sum to get grad by ReduceSum + if (out_d_x) { + if (dx_reduce_dims.empty()) { + *out_d_x = std::move(out_dx_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_dx_help, out_d_x, dx_reduce_dims); + } + out_d_x->Resize(x.dims()); + } + + if (out_d_y) { + if (dy_reduce_dims.empty()) { + *out_d_y = std::move(out_dy_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_dy_help, out_d_y, dy_reduce_dims); + } + out_d_y->Resize(y.dims()); + } + + // compute d_dout + if (out_d_dout) { + MatMulFunction(dev_ctx, + d_dx.get(), + ddy_conj, + x_dims, + y_dims, + out_d_dout, + transpose_x, + transpose_y); + MatMulFunction(dev_ctx, + ddx_conj, + d_dy.get(), + x_dims, + y_dims, + out_d_dout, + transpose_x, + transpose_y, + true); + } + // compute d_ddx + if (out_d_ddx) { + if (transpose_x && transpose_y) { + // out_d_ddx1 = y' * d_ddout' + MatMulFunction(dev_ctx, + 
y_conj, + d_ddout.get(), + y_dims, + dout_dims, + &out_d_ddx_help, + true, + true); + // out_d_ddx2 = D_DY' * DOut' + MatMulFunction(dev_ctx, + d_dy.get(), + dout_conj, + y_dims, + dout_dims, + &out_d_ddx_help, + true, + true, + true); + } else if (transpose_x) { + // out_d_ddx1 = y * d_ddout' + MatMulFunction(dev_ctx, + y_conj, + d_ddout.get(), + y_dims, + dout_dims, + &out_d_ddx_help, + false, + true); + // out_d_ddx2 = D_DY * Dout' + MatMulFunction(dev_ctx, + d_dy.get(), + dout_conj, + y_dims, + dout_dims, + &out_d_ddx_help, + false, + true, + true); + } else if (transpose_y) { + // out_d_ddx1 = d_ddout * y + MatMulFunction(dev_ctx, + d_ddout.get(), + y_conj, + dout_dims, + y_dims, + &out_d_ddx_help, + false, + false); + // out_d_ddx2 = Dout * D_DY + MatMulFunction(dev_ctx, + dout_conj, + d_dy.get(), + dout_dims, + y_dims, + &out_d_ddx_help, + false, + false, + true); + } else { + // out_d_ddx1 = d_ddout * y' + MatMulFunction(dev_ctx, + d_ddout.get(), + y_conj, + dout_dims, + y_dims, + &out_d_ddx_help, + false, + true); + // out_d_ddx2 = Dout * D_DY' + MatMulFunction(dev_ctx, + dout_conj, + d_dy.get(), + dout_dims, + y_dims, + &out_d_ddx_help, + false, + true, + true); + } + if (dx_reduce_dims.empty()) { + *out_d_ddx = std::move(out_d_ddx_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims); + } + out_d_ddx->Resize(x.dims()); + } + + // compute d_ddy + if (out_d_ddy) { + if (transpose_x && transpose_y) { + // out_d_ddy1 = d_ddout' * x' + MatMulFunction(dev_ctx, + d_ddout.get(), + x_conj, + dout_dims, + x_dims, + &out_d_ddy_help, + true, + true); + // out_d_ddy2 = dout' * d_dx' + MatMulFunction(dev_ctx, + dout_conj, + d_dx.get(), + dout_dims, + x_dims, + &out_d_ddy_help, + true, + true, + true); + } else if (transpose_x) { + // out_d_ddy1 = x * d_ddout + MatMulFunction(dev_ctx, + x_conj, + d_ddout.get(), + x_dims, + dout_dims, + &out_d_ddy_help, + false, + false); + // out_d_ddy2 = d_dx * dout + MatMulFunction(dev_ctx, + d_dx.get(), + dout_conj, + x_dims, + dout_dims, + &out_d_ddy_help, + false, + false, + true); + } else if (transpose_y) { + // out_d_ddy1 = d_ddout' * x + MatMulFunction(dev_ctx, + d_ddout.get(), + x_conj, + dout_dims, + x_dims, + &out_d_ddy_help, + true, + false); + // out_d_ddy2 = dout' * d_dx + MatMulFunction(dev_ctx, + dout_conj, + d_dx.get(), + dout_dims, + x_dims, + &out_d_ddy_help, + true, + false, + true); + } else { + // out_d_ddy1 = x' * d_ddout + MatMulFunction(dev_ctx, + x_conj, + d_ddout.get(), + x_dims, + dout_dims, + &out_d_ddy_help, + true, + false); + // out_d_ddy2 = d_dx' * dout + MatMulFunction(dev_ctx, + d_dx.get(), + dout_conj, + x_dims, + dout_dims, + &out_d_ddy_help, + true, + false, + true); + } + + if (dy_reduce_dims.empty()) { + *out_d_ddy = std::move(out_d_ddy_help); + } else { + ReduceSumForMatmulGrad()( + dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims); + } + out_d_ddy->Resize(y.dims()); + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index e50b2f0641a46..f5f69f327a69f 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -86,7 +86,7 @@ static void IndexIncreaseFromDims(const int ndim, } template -void MatMulFunction(const Context& context, +void MatMulFunction(const Context& dev_ctx, const DenseTensor& X, const DenseTensor& Y, const std::vector& x_dims, @@ -102,7 +102,7 @@ void MatMulFunction(const Context& context, const T* x_data = 
X.data(); const T* y_data = Y.data(); - auto blas = paddle::operators::math::GetBlas(context); + auto blas = paddle::operators::math::GetBlas(dev_ctx); if (x_ndim == 1 && y_ndim == 1) { const int M = X.numel(); @@ -117,6 +117,8 @@ void MatMulFunction(const Context& context, M, N)); VLOG(3) << "MatMul's case 1"; + Out->Resize({1}); + Out->mutable_data(); blas.GEMM(CblasNoTrans, CblasTrans, 1, @@ -471,7 +473,7 @@ void MatMulFunction(const Context& context, } template -void MatMulFunction(const Context& context, +void MatMulFunction(const Context& dev_ctx, const DenseTensor& X, const DenseTensor& Y, DenseTensor* Out, @@ -481,11 +483,11 @@ void MatMulFunction(const Context& context, const std::vector x_dims = vectorize(X.dims()); const std::vector y_dims = vectorize(Y.dims()); MatMulFunction( - context, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); + dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); } template -void MatmulKernel(const Context& context, +void MatmulKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, bool transpose_x, @@ -501,7 +503,7 @@ void MatmulKernel(const Context& context, paddle::platform::errors::InvalidArgument( "The Input(Y) dims size must not be equal 0," " but reviced dims size is 0. ")); - MatMulFunction(context, x, y, out, transpose_x, transpose_y); + MatMulFunction(dev_ctx, x, y, out, transpose_x, transpose_y); } } // namespace pten diff --git a/paddle/pten/kernels/matmul_grad_kernel.h b/paddle/pten/kernels/matmul_grad_kernel.h new file mode 100644 index 0000000000000..db485b79d2736 --- /dev/null +++ b/paddle/pten/kernels/matmul_grad_kernel.h @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
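+
+// This header only declares the first-, second- and third-order matmul
+// backward kernels; their template implementations live in
+// paddle/pten/kernels/impl/matmul_grad_kernel_impl.h.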
+ +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace pten { + +template +void MatmulGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + bool transpose_x, + bool transpose_y, + DenseTensor* dx, + DenseTensor* dy); + +template +void MatmulDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + bool transpose_x, + bool transpose_y, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void MatmulTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddy, + paddle::optional d_dx, + paddle::optional d_dy, + paddle::optional d_ddout, + bool transpose_x, + bool transpose_y, + DenseTensor* out_d_x, + DenseTensor* out_d_y, + DenseTensor* out_d_dout, + DenseTensor* out_d_ddx, + DenseTensor* out_d_ddy); + +} // namespace pten diff --git a/paddle/pten/kernels/matmul_kernel.h b/paddle/pten/kernels/matmul_kernel.h index fb54a5301e61c..f9cb2c3801caa 100644 --- a/paddle/pten/kernels/matmul_kernel.h +++ b/paddle/pten/kernels/matmul_kernel.h @@ -14,14 +14,15 @@ #pragma once -#include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/kernels/empty_kernel.h" + namespace pten { template -void MatmulKernel(const Context& context, +void MatmulKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, bool transpose_x, @@ -29,17 +30,14 @@ void MatmulKernel(const Context& context, DenseTensor* out); template -DenseTensor Matmul(const Context& context, +DenseTensor Matmul(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, bool transpose_x, bool transpose_y) { auto out_meta = MatmulInferMeta(x.meta(), y.meta(), transpose_x, transpose_y); - DenseTensor dense_out( - pten::make_intrusive( - context.GetPlace()), - std::move(out_meta)); - MatmulKernel(context, x, y, transpose_x, transpose_y, &dense_out); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + MatmulKernel(dev_ctx, x, y, transpose_x, transpose_y, &dense_out); return dense_out; } From 8cc09552473b842c651ead3b9848d41827a3dbab Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 11 Jan 2022 20:58:24 +0800 Subject: [PATCH 094/151] refactor reshape grad kernel (#38833) --- paddle/fluid/operators/reshape_op.cc | 64 ++++++++++++++---- paddle/pten/core/kernel_alias_name.h | 3 + paddle/pten/kernels/reshape_grad_kernel.cc | 75 ++++++++++++++++++++++ paddle/pten/kernels/reshape_grad_kernel.h | 31 +++++++++ 4 files changed, 161 insertions(+), 12 deletions(-) create mode 100644 paddle/pten/kernels/reshape_grad_kernel.cc create mode 100644 paddle/pten/kernels/reshape_grad_kernel.h diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index f2162f55636e5..a25e53aac5d73 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/include/core.h" +#include "paddle/pten/kernels/reshape_grad_kernel.h" #include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { namespace framework { @@ -467,13 +468,27 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); - auto in_dims = d_x->dims(); - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, ctx.GetPlace(), - ctx.template device_context(), d_x); - d_x->Resize(in_dims); + + auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePtenDenseTensor(*d_out); + + if (platform::is_cpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + } +#endif +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + } +#endif } }; @@ -482,14 +497,27 @@ class ReshapeDoubleGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *dd_x = ctx.Input("DDX"); auto *dd_out = ctx.Output("DDOut"); + dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); - auto out_dims = dd_out->dims(); + auto pt_dd_x = paddle::experimental::MakePtenDenseTensor(*dd_x); + auto pt_dd_out = paddle::experimental::MakePtenDenseTensor(*dd_out); - dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); - framework::TensorCopy( - *dd_x, ctx.GetPlace(), - ctx.template device_context(), dd_out); - dd_out->Resize(out_dims); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + } +#endif +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.device_context(); + pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + } +#endif } }; @@ -624,6 +652,13 @@ class Reshape2GradOp : public framework::OperatorWithKernel { return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("reshape_grad", + {framework::GradVarName("Out")}, {}, + {framework::GradVarName("X")}); + } }; class Reshape2DoubleGradOp : public framework::OperatorWithKernel { @@ -660,6 +695,11 @@ class Reshape2DoubleGradOp : public framework::OperatorWithKernel { return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("reshape_double_grad", {"DDX"}, {}, + {"DDOut"}); + } }; 
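The rerouted reshape grad kernels above preserve the original semantics: the backward pass of reshape simply copies d(Out)'s buffer into d(X) and restores X's dims, which is what the new pten::ReshapeGradKernel added below does via pten::Copy plus Resize. A minimal standalone sketch of that behaviour, using a hypothetical Tensor struct as a stand-in for DenseTensor (illustrative only, not part of the patch):

// Illustrative sketch only; "Tensor" is a made-up stand-in type.
#include <cassert>
#include <cstdint>
#include <vector>

struct Tensor {
  std::vector<int64_t> dims;  // shape metadata
  std::vector<float> data;    // flat buffer
};

// reshape_grad: d(X) receives a straight copy of d(Out)'s buffer and keeps X's shape.
void ReshapeGradSketch(const Tensor& d_out, Tensor* d_x) {
  const std::vector<int64_t> x_dims = d_x->dims;  // remember X's original dims
  d_x->data = d_out.data;                         // plain element copy, no reordering
  d_x->dims = x_dims;                             // restore the original shape
}

int main() {
  Tensor d_out{{6}, {0, 1, 2, 3, 4, 5}};
  Tensor d_x{{2, 3}, std::vector<float>(6, 0.f)};
  ReshapeGradSketch(d_out, &d_x);
  assert(d_x.dims[0] == 2 && d_x.dims[1] == 3 && d_x.data[5] == 5.f);
  return 0;
}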
DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInferer, {"X", "Out"}); diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 46fa6dd376ee3..5c86787966368 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -35,6 +35,8 @@ const std::unordered_map kernel_alias_name_map = { {"reduce_mean", "mean"}, {"reduce_sum", "sum"}, {"reshape2", "reshape"}, + {"reshape2_grad", "reshape_grad"}, + {"reshape2_grad_grad", "reshape_double_grad"}, // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated {"flatten", "deprecated"}, {"flatten_grad", "deprecated"}, @@ -43,6 +45,7 @@ const std::unordered_map kernel_alias_name_map = { {"matmul_grad_grad", "deprecated"}, {"mean", "deprecated"}, {"reshape", "deprecated"}, + {"reshape_grad", "deprecated"}, {"sum", "deprecated"}}; } // namespace pten diff --git a/paddle/pten/kernels/reshape_grad_kernel.cc b/paddle/pten/kernels/reshape_grad_kernel.cc new file mode 100644 index 0000000000000..99f0556765ef6 --- /dev/null +++ b/paddle/pten/kernels/reshape_grad_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/reshape_grad_kernel.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" + +namespace pten { + +template +void ReshapeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto x_dims = x_grad->dims(); + pten::Copy(dev_ctx, out_grad, false, x_grad); + x_grad->Resize(x_dims); +} + +template +void ReshapeDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x_grad_grad, + DenseTensor* out_grad_grad) { + ReshapeGradKernel(dev_ctx, x_grad_grad, out_grad_grad); +} + +} // namespace pten + +PT_REGISTER_GENERAL_KERNEL(reshape_grad, + CPU, + ALL_LAYOUT, + pten::ReshapeGradKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, + CPU, + ALL_LAYOUT, + pten::ReshapeDoubleGradKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_GENERAL_KERNEL(reshape_grad, + GPU, + ALL_LAYOUT, + pten::ReshapeGradKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, + GPU, + ALL_LAYOUT, + pten::ReshapeDoubleGradKernel, + ALL_DTYPE) {} +#endif + +#ifdef PADDLE_WITH_XPU +PT_REGISTER_GENERAL_KERNEL(reshape_grad, + XPU, + ALL_LAYOUT, + pten::ReshapeGradKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, + XPU, + ALL_LAYOUT, + pten::ReshapeDoubleGradKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/pten/kernels/reshape_grad_kernel.h b/paddle/pten/kernels/reshape_grad_kernel.h new file mode 100644 index 0000000000000..1492d753704fd --- /dev/null +++ b/paddle/pten/kernels/reshape_grad_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template 
+void ReshapeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& out_grad,
+                       DenseTensor* x_grad);
+
+template 
+void ReshapeDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x_grad_grad,
+                             DenseTensor* out_grad_grad);
+
+}  // namespace pten

From e9c77e09605c8c37d7704a02ec56f3ddb0ba05c4 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Wed, 12 Jan 2022 09:53:32 +0800
Subject: [PATCH 095/151] add xiaoguang into big pr approve list, test=document_fix (#38883)

---
 tools/check_file_diff_approvals.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 92f806b7e8a84..e0ae600819873 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -96,8 +96,8 @@ if [[ $changed_env_var_count -gt 0 ]]; then
 fi
 
 if [[ $git_files -gt 19 || $git_count -gt 999 ]];then
-    echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n"
-    check_approval 1 38231817
+    echo_line="You must have Dianhai or XiaoguangHu01 approval for change 20+ files or add than 1000+ lines of content.\n"
+    check_approval 1 38231817 46782768
 fi
 
 for API_FILE in ${API_FILES[*]}; do

From b7bae939bf4ff5a55b783a620abf6d7fbf757abc Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Wed, 12 Jan 2022 11:49:30 +0800
Subject: [PATCH 096/151] add args check and comment for exp, polynomial decay (#38782)

* add args check and comment for exp, polynomial decay

* modify according to zhouwei's comment
---
 python/paddle/optimizer/lr.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index d4fafba9229b0..90117f99abc48 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -398,7 +398,7 @@ class NaturalExpDecay(LRScheduler):
 
     Args:
         learning_rate (float): The initial learning rate. It is a python float number.
-        gamma (float, optional): A Ratio to update the learning rate. Default: 0.1.
+        gamma (float, optional): A Ratio to update the learning rate, should greater than 0.0 to make learning rate decay. Default: 0.1.
         last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
         verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
 
@@ -456,6 +456,7 @@ class NaturalExpDecay(LRScheduler):
     """
 
     def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
+        assert gamma > 0.0, " 'gamma' must be a positive number so that the learning rate will decay."
         self.gamma = gamma
         super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)
@@ -573,7 +574,7 @@ class PolynomialDecay(LRScheduler):
         learning_rate (float): The initial learning rate. It is a python float number.
decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer. end_lr(float, optional): The minimum final learning rate. Default: 0.0001. - power(float, optional): Power of polynomial. Default: 1.0. + power(float, optional): Power of polynomial, should greater than 0.0 to get learning rate decay. Default: 1.0. cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. @@ -644,6 +645,7 @@ def __init__(self, decay_steps, int), " 'decay_steps' must be a positive integer." self.decay_steps = decay_steps self.end_lr = end_lr + assert power > 0.0, " 'power' must be greater than 0.0 so that the learning rate will decay." self.power = power self.cycle = cycle super(PolynomialDecay, self).__init__(learning_rate, last_epoch, @@ -820,7 +822,7 @@ class ExponentialDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . - It should be less than 1.0. + It should be in interval (0.0, 1.0). last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -878,6 +880,7 @@ class ExponentialDecay(LRScheduler): """ def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + assert gamma > 0.0 and gamma < 1.0, " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay." self.gamma = gamma super(ExponentialDecay, self).__init__(learning_rate, last_epoch, verbose) From 0d8d1e0ea74267dab379fdfafef2ee90db19318f Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Wed, 12 Jan 2022 12:36:55 +0800 Subject: [PATCH 097/151] Os info (#38779) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. 
* os_info update * update * update * update * update * update * fix * update * update for windows * fix windows * update * update Co-authored-by: liutiexing --- paddle/fluid/platform/CMakeLists.txt | 1 + paddle/fluid/platform/os_info.cc | 175 ++++++++++++++++-- paddle/fluid/platform/os_info.h | 80 +++----- paddle/fluid/platform/os_info_test.cc | 40 ++++ .../platform/profiler/host_event_recorder.cc | 2 +- 5 files changed, 234 insertions(+), 64 deletions(-) create mode 100644 paddle/fluid/platform/os_info_test.cc diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 8a84429987d90..517b4a28a690f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -47,6 +47,7 @@ ENDIF() cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) cc_library(os_info SRCS os_info.cc DEPS enforce) +cc_test(os_info_test SRCS os_info_test.cc DEPS os_info) IF(WITH_GPU) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 5ba7f1d144e12..07263153164e2 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/os_info.h" +#include +#include #include +#include +#include #if defined(__linux__) #include #include @@ -21,32 +25,181 @@ limitations under the License. */ #elif defined(_MSC_VER) #include #endif +#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN namespace paddle { namespace platform { +namespace internal { -ThreadId::ThreadId() { +static uint64_t main_tid = + std::hash()(std::this_thread::get_id()); + +template +class ThreadDataRegistry { + class ThreadDataHolder; + + public: + // Singleton + static ThreadDataRegistry& GetInstance() { + static ThreadDataRegistry instance; + return instance; + } + + const T& GetCurrentThreadData() { return CurrentThreadData(); } + + void SetCurrentThreadData(const T& val) { + std::lock_guard lock(lock_); + CurrentThreadData() = val; + } + + // Returns current snapshot of all threads. Make sure there is no thread + // create/destory when using it. 
+ template ::value>> + std::unordered_map GetAllThreadDataByValue() { + std::unordered_map data_copy; + std::lock_guard lock(lock_); + data_copy.reserve(tid_map_.size()); + for (auto& kv : tid_map_) { + data_copy.emplace(kv.first, kv.second->GetData()); + } + return std::move(data_copy); + } + + void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) { + std::lock_guard lock(lock_); + tid_map_[tid] = tls_obj; + } + + void UnregisterData(uint64_t tid) { + if (tid == main_tid) { + return; + } + std::lock_guard lock(lock_); + tid_map_.erase(tid); + } + + private: + class ThreadDataHolder { + public: + ThreadDataHolder() { + tid_ = std::hash()(std::this_thread::get_id()); + ThreadDataRegistry::GetInstance().RegisterData(tid_, this); + } + + ~ThreadDataHolder() { + ThreadDataRegistry::GetInstance().UnregisterData(tid_); + } + + T& GetData() { return data_; } + + private: + uint64_t tid_; + T data_; + }; + + ThreadDataRegistry() = default; + + DISABLE_COPY_AND_ASSIGN(ThreadDataRegistry); + + T& CurrentThreadData() { + static thread_local ThreadDataHolder thread_data; + return thread_data.GetData(); + } + + std::mutex lock_; + std::unordered_map tid_map_; // not owned +}; + +class InternalThreadId { + public: + InternalThreadId(); + + const ThreadId& GetTid() const { return id_; } + + private: + ThreadId id_; +}; + +InternalThreadId::InternalThreadId() { // C++ std tid - std_tid_ = std::hash()(std::this_thread::get_id()); + id_.std_tid = std::hash()(std::this_thread::get_id()); // system tid #if defined(__linux__) - sys_tid_ = syscall(SYS_gettid); + id_.sys_tid = static_cast(syscall(SYS_gettid)); #elif defined(_MSC_VER) - sys_tid_ = GetCurrentThreadId(); -#else // unsupported platforms - sys_tid_ = 0; + id_.sys_tid = static_cast(::GetCurrentThreadId()); +#else // unsupported platforms, use std_tid + id_.sys_tid = id_.std_tid; #endif // cupti tid std::stringstream ss; ss << std::this_thread::get_id(); - cupti_tid_ = static_cast(std::stoull(ss.str())); + id_.cupti_tid = static_cast(std::stoull(ss.str())); +} + +} // namespace internal + +uint64_t GetCurrentThreadSysId() { + return internal::ThreadDataRegistry::GetInstance() + .GetCurrentThreadData() + .GetTid() + .sys_tid; } -ThreadIdRegistry::~ThreadIdRegistry() { - std::lock_guard lock(lock_); - for (auto id_pair : id_map_) { - delete id_pair.second; +uint64_t GetCurrentThreadStdId() { + return internal::ThreadDataRegistry::GetInstance() + .GetCurrentThreadData() + .GetTid() + .std_tid; +} + +ThreadId GetCurrentThreadId() { + return internal::ThreadDataRegistry::GetInstance() + .GetCurrentThreadData() + .GetTid(); +} + +std::unordered_map GetAllThreadIds() { + auto tids = + internal::ThreadDataRegistry::GetInstance() + .GetAllThreadDataByValue(); + std::unordered_map res; + for (const auto& kv : tids) { + res[kv.first] = kv.second.GetTid(); } + return res; +} + +static constexpr const char* kDefaultThreadName = "unset"; + +std::string GetCurrentThreadName() { + const auto& thread_name = + internal::ThreadDataRegistry::GetInstance() + .GetCurrentThreadData(); + return thread_name.empty() ? 
kDefaultThreadName : thread_name; +} + +std::unordered_map GetAllThreadNames() { + return internal::ThreadDataRegistry::GetInstance() + .GetAllThreadDataByValue(); +} + +bool SetCurrentThreadName(const std::string& name) { + auto& instance = internal::ThreadDataRegistry::GetInstance(); + const auto& cur_name = instance.GetCurrentThreadData(); + if (!cur_name.empty() || cur_name == kDefaultThreadName) { + return false; + } + instance.SetCurrentThreadData(name); + return true; +} + +uint32_t GetProcessId() { +#if defined(_MSC_VER) + return static_cast(GetCurrentProcessId()); +#else + return static_cast(getpid()); +#endif } } // namespace platform diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index c38198f91b36b..c84738247a46f 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -14,15 +14,12 @@ limitations under the License. */ #pragma once -#include -#include +#include #include -#include "paddle/fluid/platform/enforce.h" // import LIKELY -#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN -#include "paddle/fluid/platform/port.h" #ifdef _POSIX_C_SOURCE #include #endif +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { @@ -41,59 +38,38 @@ inline uint64_t PosixInNsec() { } // All kinds of Ids for OS thread -class ThreadId { - public: - ThreadId(); +struct ThreadId { + uint64_t std_tid = 0; // std::hash + uint64_t sys_tid = 0; // OS-specific, Linux: gettid + uint32_t cupti_tid = 0; // thread_id used by Nvidia CUPTI +}; - uint64_t MainTid() const { return SysTid(); } +// Better performance than GetCurrentThreadId +uint64_t GetCurrentThreadStdId(); - uint64_t StdTid() const { return std_tid_; } +// Better performance than GetCurrentThreadId +uint64_t GetCurrentThreadSysId(); - uint32_t CuptiTid() const { return cupti_tid_; } +ThreadId GetCurrentThreadId(); - uint64_t SysTid() const { return sys_tid_ != 0 ? sys_tid_ : std_tid_; } +// Return the map from StdTid to ThreadId +// Returns current snapshot of all threads. Make sure there is no thread +// create/destory when using it. +std::unordered_map GetAllThreadIds(); - private: - uint64_t std_tid_ = 0; // std::hash - uint32_t cupti_tid_ = 0; // thread_id used by Nvidia CUPTI - uint64_t sys_tid_ = 0; // OS-specific, Linux: gettid -}; +// Returns 'unset' if SetCurrentThreadName is never called. +std::string GetCurrentThreadName(); -class ThreadIdRegistry { - public: - // singleton - static ThreadIdRegistry& GetInstance() { - static ThreadIdRegistry instance; - return instance; - } - - const ThreadId* GetThreadId(uint64_t std_id) { - std::lock_guard lock(lock_); - if (LIKELY(id_map_.find(std_id) != id_map_.end())) { - return id_map_[std_id]; - } - return nullptr; - } - - const ThreadId& CurrentThreadId() { - static thread_local ThreadId* tid_ = nullptr; - if (LIKELY(tid_ != nullptr)) { - return *tid_; - } - tid_ = new ThreadId; - std::lock_guard lock(lock_); - id_map_[tid_->StdTid()] = tid_; - return *tid_; - } - - private: - ThreadIdRegistry() = default; - DISABLE_COPY_AND_ASSIGN(ThreadIdRegistry); - ~ThreadIdRegistry(); - - std::mutex lock_; - std::unordered_map id_map_; -}; +// Return the map from StdTid to ThreadName +// Returns current snapshot of all threads. Make sure there is no thread +// create/destory when using it. +std::unordered_map GetAllThreadNames(); + +// Thread name is immutable, only the first call will succeed. +// Returns false on failure. 
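For context, a small usage sketch of the thread-name API introduced here; it mirrors the expectations encoded in the os_info_test.cc added below. It assumes the updated os_info.h is on the include path and the paddle platform library is linked, and it is not part of the patch itself.

// Hedged usage sketch, not part of the patch.
#include <thread>
#include "paddle/fluid/platform/os_info.h"

void WorkerBody() {
  namespace plat = paddle::platform;
  // Before any SetCurrentThreadName call, GetCurrentThreadName() reports "unset".
  // The name is immutable: the first call succeeds, later calls are rejected.
  bool first = plat::SetCurrentThreadName("IOWorker");   // true
  bool second = plat::SetCurrentThreadName("Renamed");   // false, already named
  // Snapshot maps are keyed by the std::hash-based thread id of each registered thread.
  auto names = plat::GetAllThreadNames();
  auto my_name = names[plat::GetCurrentThreadStdId()];   // "IOWorker"
  (void)first;
  (void)second;
  (void)my_name;
}

int main() {
  std::thread worker(WorkerBody);
  worker.join();
  return 0;
}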
+bool SetCurrentThreadName(const std::string& name); + +uint32_t GetProcessId(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc new file mode 100644 index 0000000000000..b309bb985122d --- /dev/null +++ b/paddle/fluid/platform/os_info_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/platform/os_info.h" +#include +#include "gtest/gtest.h" + +TEST(ThreadInfo, TestThreadIdUtils) { + using paddle::platform::GetCurrentThreadStdId; + using paddle::platform::GetCurrentThreadId; + using paddle::platform::GetAllThreadIds; + EXPECT_EQ(std::hash()(std::this_thread::get_id()), + GetCurrentThreadId().std_tid); + auto ids = GetAllThreadIds(); + EXPECT_TRUE(ids.find(GetCurrentThreadStdId()) != ids.end()); +} + +TEST(ThreadInfo, TestThreadNameUtils) { + using paddle::platform::GetCurrentThreadStdId; + using paddle::platform::GetCurrentThreadName; + using paddle::platform::SetCurrentThreadName; + using paddle::platform::GetAllThreadNames; + EXPECT_EQ("unset", GetCurrentThreadName()); + EXPECT_TRUE(SetCurrentThreadName("MainThread")); + EXPECT_FALSE(SetCurrentThreadName("MainThread")); + auto names = GetAllThreadNames(); + EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); + EXPECT_EQ("MainThread", names[GetCurrentThreadStdId()]); + EXPECT_EQ("MainThread", GetCurrentThreadName()); +} diff --git a/paddle/fluid/platform/profiler/host_event_recorder.cc b/paddle/fluid/platform/profiler/host_event_recorder.cc index 14054418c5d24..b8495ca45ca84 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.cc +++ b/paddle/fluid/platform/profiler/host_event_recorder.cc @@ -16,7 +16,7 @@ namespace paddle { namespace platform { ThreadEventRecorder::ThreadEventRecorder() { - thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); + thread_id_ = GetCurrentThreadSysId(); HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); } From f5166284dc04b5e0decc40fad37278f7e600e72b Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Wed, 12 Jan 2022 12:41:47 +0800 Subject: [PATCH 098/151] Adjust warpper of gpu_lanuch_config (#38654) * first commit * fix wrong filename * fix the wrong spell name * fix gpu config warper * modify according to pr advices * fix GpuLauchConfig1D api bugs * change the config for dropout grad * fix bugs * modification according to pr advices * modification according to pr advices --- paddle/fluid/operators/bilateral_slice_op.cu | 12 +- paddle/fluid/operators/dropout_impl.cu.h | 29 +++-- .../elementwise/elementwise_add_op.cu | 6 +- .../elementwise/elementwise_sub_op.cu | 10 +- .../fused_fc_elementwise_layernorm_op.cu | 1 + paddle/fluid/operators/index_sample_op.cu | 2 +- paddle/fluid/operators/math/beam_search.cu | 1 + paddle/fluid/operators/math/pooling.cu | 37 +----- 
.../device/gpu/cuda/cuda_device_function.h | 16 --- .../platform/device/gpu/gpu_launch_config.h | 108 ++++++++++++------ .../device/gpu/rocm/rocm_device_function.h | 13 --- paddle/pten/kernels/gpu/elementwise.h | 21 ++-- 12 files changed, 117 insertions(+), 139 deletions(-) diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 3fd8995745acb..e7bf6d212dcf1 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -472,8 +472,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel { grid_sizes.gw = gw; grid_sizes.input_chans = input_chans; - platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D( - ctx.cuda_device_context(), grid_count, 512); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count); BilateralSliceCudaGridGradKernel< T><< { grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes, has_offset, grid_count, output_chans); - config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), - guide_count, 512); + config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count); BilateralSliceCudaGuideGradKernel< T><< { guide_grad_data, output_grad_data, grid_data, guide_data, input_data, grid_sizes, has_offset, guide_count, output_chans); - config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), - input_count, 512); + config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count); BilateralSliceCudaInputGradKernel< T><<(x_data) == 4) ? 4 : 1; - int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size); - int grid_size = - ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size; - + auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = - ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size; + ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); @@ -206,23 +203,25 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(VectorizedRandomGenerator), grid_size, - block_size, 0, stream, size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment); + HIP_KERNEL_NAME(VectorizedRandomGenerator), + gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size, + seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, + increment); } else { hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator), - grid_size, block_size, 0, stream, size, seed_data, - dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment); + gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, + stream, size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment); } #else if (vec_size == 4 && size % 4 == 0) { - VectorizedRandomGenerator<<>>( + VectorizedRandomGenerator<<< + gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } else { - RandomGenerator<<>>( + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } @@ -265,7 +264,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto factor = static_cast(1.0f 
/ (1.0f - dropout_prob)); auto stream = dev_ctx.stream(); platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size); + platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size); DropoutGradCUDAKernel< T, uint8_t, 4><<>>( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index b5c19a3edb818..779779b44da8d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -128,10 +128,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x->numel(); int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + ELEMENTWISE_BLOCK_SIZE - 1) / - ELEMENTWISE_BLOCK_SIZE, + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseAddGradCUDAKernel< T><<mutable_data(ctx.GetPlace()); if (dy->dims() == dout->dims()) { if (dy_data != dout_data) { - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); auto size = dy->numel(); - dim3 grid_size = dim3( - (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseSubGradCUDAKernel<<< grid_size, block_size, 0, ctx.template device_context().stream()>>>( @@ -100,10 +100,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); auto size = x->numel(); dim3 grid_size = - dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseSubGradCUDAKernel< T><<().stream()>>>( diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index c5b1fd9392950..ebda9bbaa8b81 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -23,6 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 40a968b8a397d..4260d0516e3cc 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 
0cc552d34c587..cec688262604a 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 076d3aa3361f0..9d96345eb1f6d 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -16,17 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" -#ifdef __HIPCC__ -#define POOLING_BLOCK_SIZE 256 -#else -#define POOLING_BLOCK_SIZE 512 -#endif - namespace paddle { namespace operators { namespace math { @@ -97,22 +90,6 @@ __device__ void OffsetPreparationFor4Dimension( } } -int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx, - int threads_per_block, int64_t numel) { - int sm_count = ctx.GetSMCount(); - if (numel / (sm_count << 1) < threads_per_block) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. - threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1)); - } else if (numel / (sm_count << 2) < threads_per_block) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. 
- return std::max(64, threads_per_block); -} - template __global__ void KernelPool2D( const int nthreads, const T* input_data, const int channels, @@ -491,14 +468,13 @@ class Pool2dGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads); - int grids = (nthreads + blocks - 1) / blocks; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( input_channels, input_width, input_height, ksize_width, ksize_height, stride_width, stride_height); - KernelPool2DGrad<<>>( + auto config = GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<< + config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( nthreads, input_data, output_data, output_grad_data, output_width, output_height, input_width, input_height, ksize_width, ksize_height, stride_width, stride_height, padding_width, padding_height, @@ -541,14 +517,13 @@ class Pool2dGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads); - int grids = (nthreads + blocks - 1) / blocks; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( input_channels, input_width, input_height, ksize_width, ksize_height, stride_width, stride_height); - KernelPool2DGrad<<>>( + auto config = GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<< + config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( nthreads, input_data, output_data, output_grad_data, output_width, output_height, input_width, input_height, ksize_width, ksize_height, stride_width, stride_height, padding_width, padding_height, diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 7fe2367b5510e..cd78a89088cc6 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -26,22 +26,6 @@ namespace platform { #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -inline static int RoundToPowerOfTwo(int dim) { - if (dim > 512) { - return 1024; - } else if (dim > 256) { - return 512; - } else if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ case (dim): { \ constexpr auto kPowerOfTwoDim = (dim); \ diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 55f4c8eb4cd55..883767348f06a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Used for compute gpu launch parameter +// Used for compute gpu launch parameter config #pragma once @@ -30,11 +30,36 @@ #include #include "paddle/fluid/platform/device_context.h" +#ifdef __HIPCC__ +// HIP results in error or nan if > 256 +#define PREDEFINED_BLOCK_SIZE 256 +#else +/* CUDA performs better as thread_per_block + num is between [64, 512] */ +#define PREDEFINED_BLOCK_SIZE 512 +#endif + namespace paddle { namespace platform { inline int DivUp(int a, int b) { return (a + b - 1) / b; } +/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + for round integer value into next highest power of 2. */ +static inline int RoundToPowerOfTwo(int n) { + n--; + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); +#ifdef __HIPCC__ + return std::min(256, std::max(32, (n + 1))); +#else + return std::min(1024, std::max(32, (n + 1))); +#endif +} + #ifdef WITH_NV_JETSON // The number of threads cannot be assigned 1024 in some cases when the device // is nano or tx2 . @@ -48,54 +73,64 @@ inline void ChangeThreadNum(const platform::CUDADeviceContext& context, #endif struct GpuLaunchConfig { - dim3 theory_thread_count = dim3(1, 1, 1); + public: + GpuLaunchConfig() {} + + size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); } + + size_t GetGridSize() const { + return block_per_grid.x * block_per_grid.y * block_per_grid.z; + } + + size_t GetBlockSize() const { + return thread_per_block.x * thread_per_block.y * thread_per_block.z; + } + + int compute_capability = 0; dim3 thread_per_block = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1); - int compute_capability = 0; }; +/* According to NVIDIA, if number of threads per block is 64/128/256/512, + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. 
*/ inline GpuLaunchConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, int64_t element_count, -#ifdef PADDLE_WITH_HIP - // HIP will throw GPU memory access fault if threads > 256 - int max_threads = 256) { -#else - int max_threads = 1024) { -#endif - PADDLE_ENFORCE_GT(element_count, 0, - platform::errors::InvalidArgument( - "element count should be greater than 0," - " but received value is: %d.", - element_count)); - - const int theory_thread_count = element_count; - // Get Max threads in all SM - int max_physical_threads = context.GetMaxPhysicalThreadCount(); - int sm = context.GetSMCount(); - - // Compute physical threads we need, should small than max sm threads - const int physical_thread_count = - (std::min)(max_physical_threads, theory_thread_count); - + const platform::CUDADeviceContext& context, int64_t numel, + int vec_size = 1) { + PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument( + "element quantity should be greater than 0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); - + /* If thread number per block is 64/128/256/512, cuda performs better.*/ + int limit_threads = + std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock()); #ifdef WITH_NV_JETSON if (capability == 53 || capability == 62) { - max_threads = 512; + limit_threads = 512; } #endif - - // Need get from device - const int thread_per_block = - (std::min)(max_threads, context.GetMaxThreadsPerBlock()); - const int block_count = - (std::min)(DivUp(physical_thread_count, thread_per_block), sm); + int threads = limit_threads; + int sm_count = context.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < limit_threads) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. + threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + threads = std::max(64, threads); + int blocks = DivUp(DivUp(numel, vec_size), threads); GpuLaunchConfig config; - config.theory_thread_count.x = theory_thread_count; - config.thread_per_block.x = thread_per_block; - config.block_per_grid.x = block_count; + config.thread_per_block.x = threads; + config.block_per_grid.x = blocks; config.compute_capability = capability; return config; } @@ -120,7 +155,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( GpuLaunchConfig config; // Noticed, block size is not align to 32, if needed do it yourself. 
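To make the 1D launch heuristic above concrete, here is a standalone worked example; the SM count of 80 and the problem sizes are illustrative inputs, not values taken from the patch, and the helpers are re-stated locally so the snippet compiles on its own.

// Standalone illustration (not part of the patch) of how the heuristic picks a config.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int DivUp(int64_t a, int64_t b) { return static_cast<int>((a + b - 1) / b); }

int RoundToPowerOfTwo(int n) {  // same bit trick as above, CUDA clamp [32, 1024]
  n--;
  n |= (n >> 1); n |= (n >> 2); n |= (n >> 4); n |= (n >> 8); n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));
}

void Show(int64_t numel, int vec_size, int sm_count, int limit_threads) {
  int threads = limit_threads;
  int64_t active = numel / vec_size;  // threads that actually have work
  if (active / (sm_count << 1) < limit_threads) {
    // roughly two active blocks per SM
    threads = RoundToPowerOfTwo(static_cast<int>(active / (sm_count << 1)));
  } else if (active / (sm_count << 2) < limit_threads) {
    // roughly four active blocks per SM
    threads = RoundToPowerOfTwo(static_cast<int>(active / (sm_count << 2)));
  }
  threads = std::max(64, threads);
  int blocks = DivUp(DivUp(numel, vec_size), threads);
  std::printf("numel=%lld vec=%d -> threads=%d blocks=%d\n",
              static_cast<long long>(numel), vec_size, threads, blocks);
}

int main() {
  Show(1000000, 4, 80, 512);  // large problem: threads=512, blocks=489
  Show(10000, 1, 80, 512);    // small problem: threads=64, blocks=157
  return 0;
}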
- config.theory_thread_count = dim3(x_dim, y_dim, 1); config.thread_per_block = dim3(block_cols, block_rows, 1); int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks); diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 2263383f8fabb..13ffc2396946c 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -24,19 +24,6 @@ namespace platform { #define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) -inline static int RoundToPowerOfTwo(int dim) { - // HIP results in error or nan if > 256 - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ case (dim): { \ constexpr auto kPowerOfTwoDim = (dim); \ diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index e4cc894e48354..049e430154a8b 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace pten { @@ -239,18 +239,15 @@ void ElementwiseCudaKernel(const KPDevice &ctx, VecSize><<>>( ins_data, outs_data, numel, main_offset, func); #else - int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize); - int grid_size = - ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize); + int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize * + gpu_config.GetBlockSize(); auto stream = ctx.stream(); - VectorizedElementwiseKernel<<>>( - ins_data, outs_data, numel, main_offset, func); + VectorizedElementwiseKernel<<< + gpu_config.block_per_grid, + gpu_config.thread_per_block, + 0, + stream>>>(ins_data, outs_data, numel, main_offset, func); #endif } From df5d55bb95903933d5160267d7a68c9c45269f5d Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 12:47:20 +0800 Subject: [PATCH 099/151] [part 1]change type of function args (#38885) --- paddle/fluid/operators/abs_op.cu | 4 ++-- paddle/fluid/operators/bce_loss_op.cu | 4 ++-- paddle/fluid/operators/clip_op.h | 2 +- paddle/fluid/operators/p_norm_op.cu | 8 ++++---- paddle/fluid/operators/renorm_op.cu | 2 +- paddle/pten/kernels/gpu/cast_kernel.cu | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index 94b0a3ae72938..86748d4505d28 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -24,14 +24,14 @@ struct CudaAbsFunctor; template struct CudaAbsFunctor>> { - __device__ __forceinline__ math::Real operator()(const T& x) const { + __device__ __forceinline__ math::Real operator()(const T x) const { return abs(x); } }; template struct CudaAbsFunctor>> { - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return 
std::abs(x); } }; diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 18562b243255b..da96aa92cd25a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -28,8 +28,8 @@ template struct BCELossGradFunctor { T one = static_cast(1.0f); T eps = static_cast(1e-12); - __device__ __forceinline__ T operator()(const T& x, const T& label, - const T& dout) const { + __device__ __forceinline__ T operator()(const T x, const T label, + const T dout) const { T term1 = max((one - x) * x, eps); return (dout * (x - label) / term1); } diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index f08a7b2d57314..3672fa983e495 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -32,7 +32,7 @@ template class ClipFunctor { public: explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {} - HOSTDEVICE T operator()(const T& x) const { + HOSTDEVICE T operator()(const T x) const { return x < min_ ? min_ : x > max_ ? max_ : x; } diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 1db6f6e517462..b2a9ca6f93742 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -63,7 +63,7 @@ __device__ __forceinline__ double inline_pow(double base, double exponent) { template struct NonzeroFunctor { HOSTDEVICE explicit inline NonzeroFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { + HOSTDEVICE inline T operator()(const T x) const { return static_cast(static_cast(x) != 0); } }; @@ -71,7 +71,7 @@ struct NonzeroFunctor { template struct AbsFunctor { HOSTDEVICE explicit inline AbsFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { + HOSTDEVICE inline T operator()(const T x) const { return static_cast(inline_abs(x)); } }; @@ -81,7 +81,7 @@ struct UnsignedPowFunctor { HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(inline_pow(inline_abs(x), static_cast(porder))); } float porder; @@ -90,7 +90,7 @@ struct UnsignedPowFunctor { template struct PowFunctor { HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(inline_pow(x, static_cast(porder))); } float porder; diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index 1798faa759bed..b21b9fde56f24 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -42,7 +42,7 @@ struct UnsignedPowFunctor { HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(inline_pow(inline_abs(x), static_cast(porder))); } float porder; diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 9f65400f93b9f..0bbe7a3a132d1 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -30,7 +30,7 @@ namespace pten { template struct CastFuctor { - __device__ __forceinline__ OutT operator()(const InT& x) const { + __device__ __forceinline__ OutT operator()(const InT x) const { return static_cast(x); } }; diff --git 
a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index f4bb5c5dbf755..68574c063e77f 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -34,7 +34,7 @@ struct ScaleFunctor { bias_after_scale = is_bias_after_sacle; } - __device__ __forceinline__ InT operator()(const InT& x) const { + __device__ __forceinline__ InT operator()(const InT x) const { if (bias_after_scale) { return scale * x + bias; } else { From 86434818474d5a93e00dd767aef4a88dcda79b6f Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 12:47:30 +0800 Subject: [PATCH 100/151] [part 2]change type of function args (#38886) --- paddle/fluid/operators/activation_op.cu | 223 ++++++++++++------------ 1 file changed, 111 insertions(+), 112 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 342ed3a6b19e2..8cced5cd919f2 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -24,7 +24,7 @@ struct CudaReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > zero ? x : zero; } }; @@ -34,7 +34,7 @@ struct CudaReluGradFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return out > zero ? dout : zero; } @@ -51,7 +51,7 @@ struct CudaLeakyReluFunctor : public BaseActivationFunctor { } // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > zero ? x : static_cast(alpha) * x; } }; @@ -66,7 +66,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { } // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return x > zero ? 
dout : static_cast(alpha) * dout; } @@ -79,7 +79,7 @@ struct CudaSigmoidFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(one / (one + exp(-x))); } @@ -90,7 +90,7 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out * (one - out); } @@ -103,7 +103,7 @@ struct CudaSiluFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(x / (one + exp(-x))); } @@ -115,8 +115,8 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType temp = one / (one + exp(-x)); @@ -135,7 +135,7 @@ struct CudaLogSigmoidFunctor : public BaseActivationFunctor { // For numerical stability, // logsigmoid(x) = // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType temp = x > zero ? zero : -x; return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); @@ -151,8 +151,8 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { // For numerical stability: // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, // 0))) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType temp1 = x > zero ? zero : -x; @@ -168,7 +168,7 @@ struct CudaAtanFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(atan(x)); } @@ -179,7 +179,7 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (one + x * x); } @@ -197,7 +197,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor { // softshrink(x) = x - lambda, if x > lambda; // x + lambda, if x < -lambda; // 0, otherwise. 
- __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T l = static_cast(lambda); T temp1 = static_cast(x > l); T temp2 = static_cast(x < -l); @@ -215,7 +215,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { } // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T l = static_cast(lambda); return (x >= -l && x <= l) ? zero : dout; } @@ -228,7 +228,7 @@ struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // ceil(x) = ceil(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(ceil(x)); } @@ -239,7 +239,7 @@ struct CudaFloorFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // floor(x) = floor(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(floor(x)); } @@ -250,7 +250,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // round(x) = round(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(round(x)); } @@ -259,7 +259,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor { // GradFunctor for ceil, floor and round template struct CudaZeroGradFunctor : public BaseActivationFunctor { - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return static_cast(0.0f); } @@ -271,7 +271,7 @@ struct CudaCosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(cos(x)); } @@ -282,8 +282,8 @@ struct CudaCosGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(-dout * sin(x)); @@ -297,7 +297,7 @@ struct CudaSinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sin(x)); } @@ -308,8 +308,8 @@ struct CudaSinGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * cos(x)); @@ -323,7 +323,7 @@ struct CudaTanFunctor : public BaseActivationFunctor { 
using MPType = typename details::MPTypeTrait::Type; // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(tan(x)); } @@ -334,8 +334,8 @@ struct CudaTanGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout / (cos(x) * cos(x))); @@ -349,7 +349,7 @@ struct CudaAsinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(asin(x)); } @@ -361,8 +361,8 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout / sqrt(one - x * x)); @@ -376,7 +376,7 @@ struct CudaAcosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(acos(x)); } @@ -388,8 +388,8 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(-dout / sqrt(one - x * x)); @@ -403,7 +403,7 @@ struct CudaCoshFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(cosh(x)); } @@ -414,8 +414,8 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * sinh(x)); @@ -429,7 +429,7 @@ struct CudaSinhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sinh(x)); } @@ -440,8 +440,8 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cosh(x) - __device__ __forceinline__ T 
operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * cosh(x)); @@ -455,7 +455,7 @@ struct CudaTanhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(tanh(x)); } @@ -466,7 +466,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * (one - out * out); } @@ -478,7 +478,7 @@ struct CudaAcoshFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(acosh(x)); } @@ -489,8 +489,8 @@ struct CudaAcoshGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; MPType one = static_cast(1.0f); // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / sqrt(x * x - one)); @@ -504,7 +504,7 @@ struct CudaAsinhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(asinh(x)); } @@ -516,8 +516,8 @@ struct CudaAsinhGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / sqrt(x * x + one)); @@ -531,7 +531,7 @@ struct CudaAtanhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(atanh(x)); } @@ -542,8 +542,8 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; MPType one = static_cast(1.0f); // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / (one - x * x)); @@ -557,13 +557,13 @@ struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // reciprocal(x) = 1 / x - __device__ __forceinline__ T operator()(const T& x) const { 
return one / x; } + __device__ __forceinline__ T operator()(const T x) const { return one / x; } }; template struct CudaReciprocalGradFunctor : public BaseActivationFunctor { // dx = -dout * out^2 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return -dout * out * out; } @@ -575,7 +575,7 @@ struct CudaExpFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // exp(x) = exp(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(exp(x)); } @@ -584,7 +584,7 @@ struct CudaExpFunctor : public BaseActivationFunctor { template struct CudaExpGradFunctor : public BaseActivationFunctor { // dx = dout * out - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out; } @@ -596,7 +596,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // expm1(x) = expm1(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(expm1(x)); } @@ -605,7 +605,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor { template struct CudaExpm1GradFunctor : public BaseActivationFunctor { // dx = dout * out - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out + dout; } @@ -617,7 +617,7 @@ struct CudaLogFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log(x) = log(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log(x)); } @@ -626,7 +626,7 @@ struct CudaLogFunctor : public BaseActivationFunctor { template struct CudaLogGradFunctor : public BaseActivationFunctor { // dx = dout / x - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / x; } @@ -636,7 +636,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { template struct CudaSquareFunctor : public BaseActivationFunctor { // square(x) = x * x - __device__ __forceinline__ T operator()(const T& x) const { return x * x; } + __device__ __forceinline__ T operator()(const T x) const { return x * x; } }; template @@ -644,7 +644,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { T two = static_cast(2.0f); // dx = dout * 2 * x - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout * two * x; } @@ -656,7 +656,7 @@ struct CudaSqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sqrt(x) = sqrt(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sqrt(x)); } @@ -667,7 +667,7 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { T one_half = static_cast(0.5f); // dx = dout * 0.5 / out - 
__device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return one_half * dout / out; } @@ -679,7 +679,7 @@ struct CudaRsqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // rsqrt(x) = rsqrt(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(rsqrt(x)); } @@ -690,7 +690,7 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { T minus_one_half = static_cast(-0.5f); // dx = -0.5 * dout * out^3 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return minus_one_half * dout * out * out * out; } @@ -703,7 +703,7 @@ struct CudaLog1pFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // log1p(x) = log(1 + x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log(one + x)); } @@ -714,7 +714,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (one + x); } @@ -726,7 +726,7 @@ struct CudaLog2Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log2(x) = log2(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log2(x)); } @@ -738,7 +738,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { T log_two = static_cast(log(static_cast(2.0f))); // dx = dout / (x * log(2)) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (x * log_two); } @@ -750,7 +750,7 @@ struct CudaLog10Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log10(x) = log10(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log10(x)); } @@ -762,7 +762,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { T log_ten = static_cast(log(static_cast(10.0f))); // dx = dout / (x * log(10)) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (x * log_ten); } @@ -779,7 +779,7 @@ struct CudaBReluFunctor : public BaseActivationFunctor { } // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); T temp_max = x > t_min_cast ? x : t_min_cast; @@ -799,7 +799,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { } // dx = (x > t_min && x < t_max) ? 
dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); return (x > t_min_cast && x < t_max_cast) ? dout : zero; @@ -820,7 +820,7 @@ struct CudaSoftReluFunctor : public BaseActivationFunctor { // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) // threshold should not be negative - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType t = static_cast(threshold); MPType temp_min = x < t ? x : t; @@ -841,8 +841,8 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 // threshold should not be negative - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_out) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType t = static_cast(threshold); @@ -864,7 +864,7 @@ struct CudaSTanhFunctor : public BaseActivationFunctor { } // stanh(x) = b * tanh(a * x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); MPType b = static_cast(scale_b); @@ -884,8 +884,8 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { } // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); @@ -909,7 +909,7 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor { } // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType t = static_cast(threshold); @@ -930,8 +930,8 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { } // dx = x * beta > threshold ? 
dout : dout / (1 + exp(-beta * x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType b = static_cast(beta); @@ -948,7 +948,7 @@ struct CudaSoftsignFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // softsign(x) = x / (1 + abs(x)) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x / (one + abs(x)); } }; @@ -958,7 +958,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + abs(x))^2 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T temp = one + abs(x); return dout / (temp * temp); } @@ -976,7 +976,7 @@ struct CudaRelu6Functor : public BaseActivationFunctor { } // relu6(x) = min(max(0, x), 6) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); return x <= zero ? zero : (x < t ? x : t); } @@ -992,7 +992,7 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < t) ? dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { T t = static_cast(threshold); return (out > zero && out < t) ? dout : zero; } @@ -1005,7 +1005,7 @@ struct CudaTanhShrinkFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(x - tanh(x)); } @@ -1016,8 +1016,8 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * tanh(x) * tanh(x)); @@ -1036,7 +1036,7 @@ struct CudaHardShrinkFunctor : public BaseActivationFunctor { } // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); return (x > -t && x < t) ? zero : x; } @@ -1052,7 +1052,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { } // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T t = static_cast(threshold); return (x > -t && x < t) ? zero : dout; } @@ -1074,7 +1074,7 @@ struct CudaHardSigmoidFunctor : public BaseActivationFunctor { // hard_sigmoid(x) = 0, when x <= -3 // 1, when x >= 3 // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T temp = x * static_cast(slope) + static_cast(offset); T temp_max = temp > zero ? 
temp : zero; T temp_min = temp_max < one ? temp_max : one; @@ -1094,7 +1094,7 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } @@ -1112,7 +1112,7 @@ struct CudaSwishFunctor : public BaseActivationFunctor { } // swish(x) = x / (1 + exp(-beta * x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType b = static_cast(beta); return static_cast(x / (one + exp(-b * x))); @@ -1130,8 +1130,8 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { } // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType b = static_cast(beta); @@ -1159,7 +1159,7 @@ struct CudaMishFunctor : public BaseActivationFunctor { // softplus(x) = x, if x > threshold // = ln(1 + exp(x)), otherwise // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); return static_cast(x * tanh(sp)); @@ -1180,8 +1180,8 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { // sp = softplus(x) // Inputs: args[0], the input dout // args[1], the input x - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); @@ -1204,7 +1204,7 @@ struct CudaThresholdedReluFunctor : public BaseActivationFunctor { } // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > static_cast(threshold) ? x : zero; } }; @@ -1219,7 +1219,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { } // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return x > static_cast(threshold) ? dout : zero; } @@ -1241,7 +1241,7 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor { // x , when x >= threshold - offset // x * (x + offset) / scale, otherwise // threshold = scale = 6, offset = 3 by default - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); T temp = x + static_cast(offset); T temp_max = temp > zero ? 
temp : zero; @@ -1267,7 +1267,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { // dout , when x >= threshold - offset // dout * (2 * x / scale + offset / scale), otherwise // threshold = scale = 6, offset = 3 by default - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T o = static_cast(offset); T s = static_cast(scale); T temp1 = static_cast(x + o > zero); @@ -1291,7 +1291,7 @@ struct CudaELUFunctor : public BaseActivationFunctor { // elu(x) = x, if x > 0 // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { CT x = static_cast(arg_x); CT temp = static_cast(alpha) * (exp(x) - one); CT res = x > zero ? x : temp; @@ -1312,8 +1312,7 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { // case 1: alpha >= 0 // dx = dout, if out > 0 // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_out) const { + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType a = static_cast(alpha); @@ -1338,8 +1337,8 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { // case 2: alpha < 0 // dx = dout, if x > 0 // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T& arg_dout, const T& arg_out, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType x = static_cast(arg_x); @@ -1393,7 +1392,7 @@ struct CudaCELUFunctor : public BaseActivationFunctor { } // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { CT x = static_cast(arg_x); CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); @@ -1416,8 +1415,8 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 // dx = dout , if alpha < 0 and x > 0 // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType a = static_cast(alpha); From a250c56c523d09235cfba8626e2ebed6005f134e Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 12:49:07 +0800 Subject: [PATCH 101/151] [part 4]change type of function args (#38888) --- paddle/fluid/operators/complex_op.h | 4 ++-- paddle/fluid/operators/label_smooth_op.cu | 4 ++-- paddle/fluid/operators/lgamma_op.cu | 2 +- paddle/fluid/operators/matrix_rank_op.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/complex_op.h b/paddle/fluid/operators/complex_op.h index c6ae46f5a828f..3dd5ea9f7e83d 100644 --- a/paddle/fluid/operators/complex_op.h +++ b/paddle/fluid/operators/complex_op.h @@ -25,14 +25,14 @@ namespace operators { // functors to use with ElementwiseComputeEx template struct RealAndImagToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T& x, const T& y) { + inline HOSTDEVICE platform::complex operator()(const T x, const T y) { return platform::complex(x, y); } }; template struct ImagAndRealToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T& y, const T& x) { + inline HOSTDEVICE platform::complex operator()(const T y, const T x) { return platform::complex(x, y); } }; diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index 2e7d1de3bd756..2c7a08de0f65b 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -28,7 +28,7 @@ struct LabelSmoothFunctor { label_dim = static_cast(label_dim_data); } - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return (static_cast(1 - epsilon) * x + static_cast(epsilon / label_dim)); } @@ -42,7 +42,7 @@ struct LabelSmoothGradFunctor { epsilon = static_cast(epsilon_data); } - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return static_cast(1 - epsilon) * x; } }; diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index baf86c99b5678..da40518d9b4b2 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -21,7 +21,7 @@ namespace operators { template struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return Eigen::numext::lgamma(x); } }; diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/fluid/operators/matrix_rank_op.h index 7fa74368332d0..c3d99a21b7235 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/fluid/operators/matrix_rank_op.h @@ -48,17 +48,17 @@ static DDim RemoveLastDim(const DDim& dim) { template struct GreaterThanFunctor { - HOSTDEVICE int operator()(const T& a, const T& b) const { return a > b; } + HOSTDEVICE int operator()(const T a, const T b) const { return a > b; } }; template struct LessThanFunctor { - HOSTDEVICE int operator()(const T& a, const T& b) const { return a < b; } + HOSTDEVICE 
int operator()(const T a, const T b) const { return a < b; } }; template struct GreaterElementFunctor { - HOSTDEVICE T operator()(const T& a, const T& b) const { + HOSTDEVICE T operator()(const T a, const T b) const { if (a > b) { return a; } else { From 572ba24ea0f61f9ce54617c8c0dcd99c86bd17a7 Mon Sep 17 00:00:00 2001 From: Chen Long <1300851984@qq.com> Date: Wed, 12 Jan 2022 14:25:54 +0800 Subject: [PATCH 102/151] Fix api docs (#38882) * update readme test=document_fix * update conll05 docs * update conll05 docs test=document_fix --- python/paddle/text/datasets/conll05.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py index 7dd29637706f3..88ae5e3d8c6e9 100644 --- a/python/paddle/text/datasets/conll05.py +++ b/python/paddle/text/datasets/conll05.py @@ -300,9 +300,10 @@ def get_dict(self): .. code-block:: python - from paddle.text.datasets import Conll05st - conll05st = Conll05st() - word_dict, predicate_dict, label_dict = conll05st.get_dict() + from paddle.text.datasets import Conll05st + + conll05st = Conll05st() + word_dict, predicate_dict, label_dict = conll05st.get_dict() """ return self.word_dict, self.predicate_dict, self.label_dict @@ -314,8 +315,9 @@ def get_embedding(self): .. code-block:: python - from paddle.text.datasets import Conll05st - conll05st = Conll05st() - emb_file = conll05st.get_embedding() + from paddle.text.datasets import Conll05st + + conll05st = Conll05st() + emb_file = conll05st.get_embedding() """ return self.emb_file From 676903d558f7e15d61bba6e9fb0cba7c80d1cff3 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 12 Jan 2022 14:51:31 +0800 Subject: [PATCH 103/151] [PTen]Refactor impl of elementwise op grad_kernel (Part1) (#38873) * refactor the impl of elementwise grad kernel * refactor impl of elementwise grad kernel(cuda) * fix compile bugs --- .../elementwise/elementwise_op_function.h | 807 +----------------- paddle/fluid/operators/viterbi_decode_op.h | 9 +- paddle/pten/kernels/cpu/elementwise.h | 192 ++++- paddle/pten/kernels/funcs/elementwise_base.h | 54 ++ paddle/pten/kernels/gpu/elementwise.h | 612 +++++++++++++ 5 files changed, 895 insertions(+), 779 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 41cb2696f5492..37d29ed91b3d4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -46,13 +46,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#ifdef __HIPCC__ -constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; -#else -constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; -#endif -#define BLOCK_X 32 -#define BLOCK_Y 32 #endif #include "paddle/fluid/operators/math/math_function.h" @@ -136,16 +129,6 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, return axis; } -inline int GetElementwiseIndex(const int *x_dims_array, const int max_dim, - const int *index_array) { - return pten::GetElementwiseIndex(x_dims_array, max_dim, index_array); -} - -inline void UpdateElementwiseIndexArray(const int *out_dims_array, - const int max_dim, int *index_array) { - pten::UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array); -} - inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, const framework::DDim &y_dims, int *x_dims_array, int *y_dims_array, @@ -169,205 +152,7 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x, is_xsize_larger); } -template -void CommonGradBroadcastCPU( - const framework::Tensor &x, const framework::Tensor &y, - const framework::Tensor &out, const framework::Tensor &dout, - framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, - int *y_dims_array, int *out_dims_array, int max_dim, - const platform::CPUDeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { - std::vector index_array(max_dim, 0); - const T *x_data = x.data(); - const T *y_data = y.data(); - const Tout *out_data = out.data(); - const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); - if (dx_data != nullptr) { - memset(dx_data, 0, dx->numel() * sizeof(T)); - } - if (dy_data != nullptr) { - memset(dy_data, 0, dy->numel() * sizeof(T)); - } - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (dx_data != nullptr) { - dx_data[x_index] += dx_op(x_data[x_index], y_data[y_index], - out_data[out_index], dout_data[out_index]); - } - if (dy_data != nullptr) { - dy_data[y_index] += dy_op(x_data[x_index], y_data[y_index], - out_data[out_index], dout_data[out_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -inline void ComputeBroadcastKernelSize(int *x_dims_array, int *out_dims_array, - int *x_blocks, int *x_threads, - int max_dim) { - *x_blocks = 1; - *x_threads = 1; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] == out_dims_array[i]) { - *x_blocks *= x_dims_array[i]; - } else { - *x_threads *= out_dims_array[i]; - } - } -} - -inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs, - int *x_trans_indexs, - const int max_dim, - const int x_one_size) { - int diff = max_dim - x_one_size; - std::copy_n(x_one_indexs, x_one_size, x_trans_indexs + diff); - int p = 0; - int q = diff; - for (int i = 0; i < max_dim; ++i) { - if (q < max_dim && i == x_trans_indexs[q]) { - ++q; - } else { - x_trans_indexs[p++] = i; - } - } -} - #if defined(__NVCC__) || defined(__HIPCC__) -template -static __global__ void ElemwiseGradBroadcast1CUDAKernel( - const T *x, const T *y, const Tout *out, const Tout *dout, int h, int w, - 
bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - int j = blockIdx.x; - int i = threadIdx.x; - int tid = threadIdx.x; - T val(0); - if (is_xsize_larger) { - do { - int x_offset = i * w + j; - if (dx) { - dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy) { - val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); - - if (dy) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; - } - } - } else { // x.dims < y.dims, broadcast for x. - do { - int y_offset = i * w + j; - if (dy) { - dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx) { - val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); - - if (dx) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dx[j] = val; - } - } - } -} - -// suppose use 2D block is fast because more parallel -// and memory coalesced -template -static __global__ void FastElemwiseGradBroadcast1CUDAKernel( - const T *x, const T *y, const Tout *out, const Tout *dout, int h, int w, - bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - __shared__ T sdata[BLOCK_Y][BLOCK_X + 1]; - - T val(0); - size_t width_stride = gridDim.x * blockDim.x; - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - size_t full_width = - (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0); - size_t full_height = - (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? BLOCK_Y : 0); - if (is_xsize_larger) { - for (int m = idx; m < full_width; m += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { - int x_offset = n * w + m; - if (dx && m < w && n < h) { - dx[x_offset] = - dx_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); - } - if (dy) { - if (m < w && n < h) { - T val = dy_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); - sdata[threadIdx.y][threadIdx.x] += val; - } - __syncthreads(); - } - } - if (dy) { - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) - my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); - __syncthreads(); - if ((threadIdx.x == 0)) { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if (threadIdx.y == 0 && m < w) { - dy[m] = sdata[0][threadIdx.x]; - } - } - } - } else { // x.dims < y.dims, broadcast for x. 
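// In this branch x is the broadcast (smaller) operand: the output is viewed
// as an h x w matrix, dy is produced elementwise, and dx[m] must collect the
// sum of column m over all h rows. The BLOCK_X x BLOCK_Y tile together with
// the shared-memory / warp-shuffle reduction below performs that column
// reduction. Worked example (illustrative): y of shape [3, 4] with x of
// shape [4] gives h = 3, w = 4, so each dx[m] sums 3 partial gradients.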
- for (int m = idx; m < full_width; m += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { - int y_offset = n * w + m; - if (dy && m < w && n < h) { - dy[y_offset] = - dy_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx) { - if (m < w && n < h) { - T val = dx_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); - sdata[threadIdx.y][threadIdx.x] += val; - } - __syncthreads(); - } - } - if (dx) { - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) - my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); - __syncthreads(); - if ((threadIdx.x == 0)) { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if (threadIdx.y == 0 && m < w) { - dx[m] = sdata[0][threadIdx.x]; - } - } - } - } -} template __global__ void CommonGradBroadcastCUDAKernel( @@ -408,267 +193,6 @@ __global__ void CommonGradBroadcastCUDAKernel( } } -template -static __global__ void CommonGradBroadcast1CUDAKernelHeight( - const T *x, const T *y, const Tout *out, const Tout *dout, int h, int w, - DY_OP dy_op, T *dy, int x_h, int x_w, bool is_y) { - int j = blockIdx.x; - int i = threadIdx.x; - int tid = threadIdx.x; - T val(0); - - if (is_y) { - do { - int out_offset = i * w + j; - int x_offset = (i % x_h) * x_w + j % x_w; - if (dy) { - val += dy_op(x[x_offset], y[j], out[out_offset], dout[out_offset]); - } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); - - if (dy) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; - } - } - } else { - do { - int out_offset = i * w + j; - int y_offset = (i % x_h) * x_w + j % x_w; - if (dy) { - val += dy_op(x[j], y[y_offset], out[out_offset], dout[out_offset]); - } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); - - if (dy) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; - } - } - } -} - -template -static __global__ void FastCommonGradBroadcastCUDAKernelHeight( - const T *x, const T *y, const Tout *out, const Tout *dout, int h, int w, - DY_OP dy_op, T *dy, int x_h, int x_w, bool is_y) { - __shared__ T sdata[BLOCK_Y][BLOCK_X + 1]; - - T val(0); - size_t width_stride = gridDim.x * blockDim.x; - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - size_t full_width = - (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0); - size_t full_height = - (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? 
BLOCK_Y : 0); - if (is_y) { - for (int m = idx; m < full_width; m += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { - int out_offset = n * w + m; - int x_offset = (n % x_h) * x_w + m % x_w; - if (dy) { - if (m < w && n < h) { - T val = dy_op(x[x_offset], y[m], out[out_offset], dout[out_offset]); - sdata[threadIdx.y][threadIdx.x] += val; - } - __syncthreads(); - } - } - if (dy) { - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) { - my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); - } - __syncthreads(); - if ((threadIdx.x == 0)) { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if (threadIdx.y == 0 && m < w) { - dy[m] = sdata[0][threadIdx.x]; - } - } - } - } else { - for (int m = idx; m < full_width; m += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { - int out_offset = n * w + m; - int y_offset = (n % x_h) * x_w + m % x_w; - if (dy) { - if (m < w && n < h) { - T val = dy_op(x[m], y[y_offset], out[out_offset], dout[out_offset]); - sdata[threadIdx.y][threadIdx.x] += val; - } - __syncthreads(); - } - } - if (dy) { - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) { - my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); - } - __syncthreads(); - if ((threadIdx.x == 0)) { - sdata[0][threadIdx.y] = my_val; - } - __syncthreads(); - if (threadIdx.y == 0 && m < w) { - dy[m] = sdata[0][threadIdx.x]; - } - } - } - } -} - -template -static __global__ void FastCommonGradBroadcastAllCUDAKernel( - const T *x, const T *y, const Tout *out, const Tout *dout, int pre, int n, - int post, bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - int tid = threadIdx.x; - int bid = blockIdx.x; - - T val(0); - if (is_xsize_larger) { - for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { - int b_i = bid / post; - int b_j = bid % post; - int x_offset = b_i * n * post + i * post + b_j; - int y_offset = b_i * post + b_j; - if (dx) { - dx[x_offset] = - dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - } - if (dy) { - val += dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - } - } - if (dy) { - int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); - if (tid == 0) { - dy[bid] = val; - } - } - } else { - for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { - int b_i = bid / post; - int b_j = bid % post; - int y_offset = b_i * n * post + i * post + b_j; - int x_offset = b_i * post + b_j; - if (dy) { - dy[y_offset] = - dy_op(x[x_offset], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx) { - val += dx_op(x[x_offset], y[y_offset], out[y_offset], dout[y_offset]); - } - } - if (dx) { - int h = n > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); - if (tid == 0) { - dx[bid] = val; - } - } - } -} - -template -static __global__ void FastCommonGradBroadcastOneCUDAKernel( - const T *x, const T *y, const Tout *out, const Tout *dout, int pre, int n, - int post, int y_pre, int y_n, int y_post, bool is_xsize, OP op, T *dd) { - int tid = threadIdx.x; - int bid = blockIdx.x; - - T val(0); - if (is_xsize) { - // do reduce for x - for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { - int b_i = bid / post; - int b_j = bid % post; - int x_offset = b_i * n * post + b_j; - int out_offset = b_i * n * post + i * post + b_j; - - // Get y pre rows id with x post and y_pre. - int b_yi = bid / (post * y_pre); - int b_yj = bid % y_post; - int y_offset = b_yi * y_n + i * y_post + b_yj; - - if (dd) { - val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]); - } - } - if (dd) { - int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); - if (tid == 0) { - dd[bid] = val; - } - } - } else { - // do reduce for y - for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { - int b_i = bid / post; - int b_j = bid % post; - int y_offset = b_i * n * post + b_j; - int out_offset = b_i * n * post + i * post + b_j; - - int b_yi = bid / (post * y_pre); - int b_yj = bid % y_post; - int x_offset = b_yi * y_n + i * y_post + b_yj; - - if (dd) { - val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]); - } - } - if (dd) { - int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); - if (tid == 0) { - dd[bid] = val; - } - } - } -} - -// Check input can be split into 2 parts -static inline bool SplitDims(const std::vector &y_broadcast_pos, - int max_dim) { - bool can_split_dim2 = true; - // must at start or end. - if (y_broadcast_pos[0] != 0 && - y_broadcast_pos[y_broadcast_pos.size() - 1] != max_dim - 1) { - can_split_dim2 = false; - } else { - for (int i = 1; i < y_broadcast_pos.size(); ++i) { - // dim must be continue - if (y_broadcast_pos[i] != y_broadcast_pos[i - 1] + 1) { - can_split_dim2 = false; - break; - } - } - } - return can_split_dim2; -} - -// Suppose only has contiguous dims -static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { - for (int i = 1; i < broadcast_pos.size(); ++i) { - if (broadcast_pos[i] != broadcast_pos[i - 1] + 1) { - return false; - } - } - return true; -} - template void CommonGradBroadcastCUDA( const framework::Tensor &x, const framework::Tensor &y, @@ -700,10 +224,10 @@ void CommonGradBroadcastCUDA( std::vector x_trans_indexs(max_dim); std::vector y_trans_indexs(max_dim); - ComputeBroadcastTranspositionArray(x_one_indexs.data(), x_trans_indexs.data(), - max_dim, x_one_indexs.size()); - ComputeBroadcastTranspositionArray(y_one_indexs.data(), y_trans_indexs.data(), - max_dim, y_one_indexs.size()); + pten::ComputeBroadcastTranspositionArray( + x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); + pten::ComputeBroadcastTranspositionArray( + y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); // compute array stride for cuda kernel; // e.g. 
x.dims=[2,3,4], x_stride=[12,4,1] @@ -790,15 +314,15 @@ void CommonGradBroadcastCUDA( if (w < 16 || h < 16) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int grid_size = w; - CommonGradBroadcast1CUDAKernelHeight<<>>( + pten::CommonGradBroadcast1CUDAKernelHeight<<>>( x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw, is_y); } else { dim3 block_size = dim3(BLOCK_X, BLOCK_Y); int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastCommonGradBroadcastCUDAKernelHeight<<>>( + pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw, is_y); } @@ -806,15 +330,15 @@ void CommonGradBroadcastCUDA( if (w < 16 || h < 16) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int grid_size = w; - CommonGradBroadcast1CUDAKernelHeight<<>>( + pten::CommonGradBroadcast1CUDAKernelHeight<<>>( x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, is_y); } else { dim3 block_size = dim3(BLOCK_X, BLOCK_Y); int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastCommonGradBroadcastCUDAKernelHeight<<>>( + pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, is_y); } @@ -835,14 +359,15 @@ void CommonGradBroadcastCUDA( if (w < 16 || h < 16) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int grid_size = w; - ElemwiseGradBroadcast1CUDAKernel<<>>( + pten::ElemwiseGradBroadcast1CUDAKernel<<>>( x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, dx_data, dy_data); } else { dim3 block_size = dim3(BLOCK_X, BLOCK_Y); int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastElemwiseGradBroadcast1CUDAKernel<<>>( + pten::FastElemwiseGradBroadcast1CUDAKernel<<>>( x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, dx_data, dy_data); } @@ -876,7 +401,8 @@ void CommonGradBroadcastCUDA( int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); int grid_size = pre * post; - FastCommonGradBroadcastAllCUDAKernel<<>>( + pten::FastCommonGradBroadcastAllCUDAKernel<<>>( x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op, dy_op, dx_data, dy_data); }; @@ -907,8 +433,8 @@ void CommonGradBroadcastCUDA( // size. if (k_pre != pre) k_pre = pre / k_pre; - FastCommonGradBroadcastOneCUDAKernel<<>>( + pten::FastCommonGradBroadcastOneCUDAKernel<<>>( x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, k_post, true, dx_op, dx_data); } else { @@ -921,8 +447,8 @@ void CommonGradBroadcastCUDA( int grid_size = pre * post; if (k_pre != pre) k_pre = pre / k_pre; - FastCommonGradBroadcastOneCUDAKernel<<>>( + pten::FastCommonGradBroadcastOneCUDAKernel<<>>( x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, k_post, false, dy_op, dy_data); } @@ -936,7 +462,7 @@ void CommonGradBroadcastCUDA( // 2. if both x and y need broadcast, then do it one by one. bool fast_broadcast = false; if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - can_split_y = SplitDims(y_broadcast_pos, max_dim); + can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); if (can_split_y) { // only y need to do broadcast on h if (y_broadcast_pos[0] == 0) { @@ -944,28 +470,29 @@ void CommonGradBroadcastCUDA( fast_broadcast = true; } } else if (y_broadcast_pos.size() == 1 || - CheckContiguousDims(y_broadcast_pos)) { // for only one dim and - // contiguous broadcast. + pten::CheckContiguousDims( + y_broadcast_pos)) { // for only one dim and + // contiguous broadcast. 
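// Worked example for this branch (illustrative, not from the original
// source): with max_dim = 4, y_broadcast_pos = {1, 2} is contiguous, so the
// fast all-broadcast path below applies; y_broadcast_pos = {0, 2} is neither
// splittable nor contiguous, so fast_broadcast stays false and the generic
// CommonGradBroadcastCUDAKernel path at the end of this function handles it.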
// If cannot split, which means input has 3 parts FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); fast_broadcast = true; } } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { // only x need broadcast - can_split_x = SplitDims(x_broadcast_pos, max_dim); + can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); if (can_split_x) { if (x_broadcast_pos[0] == 0) { FastBroadCastHeightCUDAF(x_broadcast_pos, false); fast_broadcast = true; } } else if (x_broadcast_pos.size() == 1 || - CheckContiguousDims(x_broadcast_pos)) { + pten::CheckContiguousDims(x_broadcast_pos)) { FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); fast_broadcast = true; } } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { // do x and y broadcast each. - can_split_y = SplitDims(y_broadcast_pos, max_dim); + can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); bool fast_broadcast_x = false; bool fast_broadcast_y = false; if (can_split_y) { @@ -979,7 +506,7 @@ void CommonGradBroadcastCUDA( can_split_y = true; fast_broadcast_y = true; } - can_split_x = SplitDims(x_broadcast_pos, max_dim); + can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); if (can_split_x) { if (x_broadcast_pos[0] == 0) { FastCommonCUDAF(x_broadcast_pos, false); @@ -1005,12 +532,12 @@ void CommonGradBroadcastCUDA( } int x_blocks = 0; int x_threads = 0; - ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, - &x_threads, max_dim); + pten::ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, + &x_threads, max_dim); int y_blocks = 0; int y_threads = 0; - ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, - &y_threads, max_dim); + pten::ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, + &y_threads, max_dim); auto x_strides_array_tmp = memory::Alloc(ctx, bytes); int *x_strides_array_gpu = @@ -1076,228 +603,6 @@ inline framework::DDim trim_trailing_singular_dims( return pten::funcs::trim_trailing_singular_dims(dims); } -template -struct ElemwiseGradNoBroadcast { - const T *x_; - const T *y_; - const Tout *out_; - const Tout *dout_; - - HOSTDEVICE void operator()(size_t i) { - if (dx_ != nullptr) { - dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); - } - if (dy_ != nullptr) { - dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); - } - } - - DX_OP dx_op_; - DY_OP dy_op_; - T *dx_; - T *dy_; -}; - -template -static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const Tout *out, - const Tout *dout, int h, int w, - bool is_xsize_larger, DX_OP dx_op, - DY_OP dy_op, T *dx, T *dy) { - if (is_xsize_larger) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. 
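// Shape convention used by the Broadcast1 helpers (worked example,
// illustrative): when post == 1 the operands collapse to a 2-D view [h, w]
// in which the smaller operand supplies the trailing dimension w, e.g.
// x: [2, 3, 4] with y: [4] gives h = 2 * 3 = 6 and w = 4. In the branch that
// starts here x is the smaller side, so dy is written elementwise while
// dx[j] accumulates contributions from all h rows of column j.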
- for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int y_offset = i * w + j; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } -} - -#if defined(__NVCC__) || defined(__HIPCC__) - -template -static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, const T *x, - const T *y, const Tout *out, - const Tout *dout, int h, int w, - bool is_xsize_larger, DX_OP dx_op, - DY_OP dy_op, T *dx, T *dy) { - // For small case use 1D block - constexpr int half_walf = 16; - if (w < half_walf || h < half_walf) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int gird_size = w; - ElemwiseGradBroadcast1CUDAKernel<<>>( - x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); - } else { - // suppose perfoemance improves with h increased. - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - FastElemwiseGradBroadcast1CUDAKernel<<>>( - x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); - } -} - -#endif - -template -static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const Tout *out, - const Tout *dout, int pre, int n, - int post, bool is_xsize_larger, - DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - if (is_xsize_larger) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int y_offset = i * n * post + j * post + k; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0 && k == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } - } -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -static __global__ void ElemwiseGradBroadcast2CUDAKernel( - const T *x, const T *y, const Tout *out, const Tout *dout, int pre, int n, - int post, bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - int tid = threadIdx.x; - int j = blockIdx.x; - - T val(0); - int ttid = tid; - - if (is_xsize_larger) { - while (true) { - int i = ttid / post; - int k = ttid % post; - if (i >= pre) break; - - int x_offset = i * n * post + j * post + k; - - if (dx != nullptr) { - dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - - if (dy != nullptr) { - val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - - ttid += ELEMWISE_MAX_BLOCK_DIM; - } - - if (dy) { - int h = pre * post; - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; - } - } - } else { // x.dims < y.dims, broadcast for x. 
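// Shape convention used by the Broadcast2 helpers (worked example,
// illustrative): when post > 1 the larger operand is viewed as
// [pre, n, post] and the smaller one holds n elements, e.g. x: [2, 3, 4]
// with y: [3] on axis 1 gives pre = 2, n = 3, post = 4. In the branch that
// starts here x is the smaller side: dy is written elementwise and each
// dx[j] reduces over the pre * post positions that share middle index j.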
- while (true) { - int i = ttid / post; - int k = ttid % post; - if (i >= pre) break; - - int y_offset = i * n * post + j * post + k; - - if (dy != nullptr) { - dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - - if (dx != nullptr) { - val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - - ttid += ELEMWISE_MAX_BLOCK_DIM; - } - - if (dx) { - int h = pre * post; - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dx[j] = val; - } - } - } -} - -template -static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, const T *x, - const T *y, const Tout *out, - const Tout *dout, int pre, int n, - int post, bool is_xsize_larger, - DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); - int gird_size = n; - ElemwiseGradBroadcast2CUDAKernel<<>>( - x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); -} - -#endif - template void CommonElementwiseBroadcastBackward( @@ -1334,7 +639,7 @@ void CommonElementwiseBroadcastBackward( dy_op); #endif } else { - CommonGradBroadcastCPU( + pten::CommonGradBroadcastCPU( x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), max_dim, ctx.template device_context(), dx_op, @@ -1342,28 +647,6 @@ void CommonElementwiseBroadcastBackward( } } -template -void ElemwiseGradComputeNoBroadcast( - const framework::ExecutionContext &ctx, const framework::DDim &x_dim, - const framework::DDim &y_dim, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - size_t N = static_cast(framework::product(x_dim)); -#if !defined(_WIN32) - platform::ForRange for_range( - ctx.template device_context(), N); -#else - platform::ForRange for_range( - ctx.device_context(), N); -#endif // !_WIN32 - for_range(ElemwiseGradNoBroadcast{ - x.data(), y.data(), out.data(), dout.data(), dx_op, - dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); -} - template void ElemwiseGradComputeWithBroadcast( @@ -1412,7 +695,7 @@ void ElemwiseGradComputeWithBroadcast( if (post == 1) { if (platform::is_gpu_place(ctx.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) - ElemwiseGradBroadcast1CUDA( + pten::ElemwiseGradBroadcast1CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), pre, n, is_xsize_larger, dx_op, dy_op, @@ -1420,7 +703,7 @@ void ElemwiseGradComputeWithBroadcast( dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif } else { - ElemwiseGradBroadcast1CPU( + pten::ElemwiseGradBroadcast1CPU( x.data(), y.data(), out.data(), dout.data(), pre, n, is_xsize_larger, dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), @@ -1429,7 +712,7 @@ void ElemwiseGradComputeWithBroadcast( } else { if (platform::is_gpu_place(ctx.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) - ElemwiseGradBroadcast2CUDA( + pten::ElemwiseGradBroadcast2CUDA( ctx.template device_context().stream(), x.data(), y.data(), out.data(), dout.data(), pre, n, post, is_xsize_larger, dx_op, dy_op, @@ -1437,7 +720,7 @@ void ElemwiseGradComputeWithBroadcast( dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); #endif } else { - ElemwiseGradBroadcast2CPU( + pten::ElemwiseGradBroadcast2CPU( x.data(), y.data(), out.data(), dout.data(), pre, n, post, is_xsize_larger, dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), @@ -1474,8 +757,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); if (x.dims() == y.dims()) { - ElemwiseGradComputeNoBroadcast( - ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + const auto &dev_ctx = ctx.template device_context(); + pten::funcs::ElemwiseGradComputeNoBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { ElemwiseGradComputeWithBroadcast( ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); @@ -1497,8 +782,10 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); if (x.dims() == y.dims()) { - ElemwiseGradComputeNoBroadcast( - ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); + const auto &dev_ctx = ctx.template device_context(); + pten::funcs::ElemwiseGradComputeNoBroadcast( + dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, + dy_op); } else { ElemwiseGradComputeWithBroadcast( ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 4da137f77433d..2b392ae74cc82 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -150,9 +150,12 @@ struct GetInputIndex { const std::vector& output_strides, int output_idx, int* index_array, int* lhs_idx, int* rhs_idx) { int out_dims_size = output_strides.size(); - *lhs_idx = GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); - *rhs_idx = GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); - UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, index_array); + *lhs_idx = + pten::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); + *rhs_idx = + pten::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); + pten::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, + index_array); } }; diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index d3687b22fb392..5a421de1173d8 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" @@ -22,6 +23,8 @@ limitations under the License. 
*/ namespace pten { +// FORWARD CODE + // Add template struct SameDimsAddFunctor { @@ -206,6 +209,56 @@ inline int GetElementwiseIndex(const int* x_dims_array, return index_; } +template +void CommonGradBroadcastCPU(const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int* x_dims_array, + int* y_dims_array, + int* out_dims_array, + int max_dim, + const CPUContext& ctx, + DX_OP dx_op, + DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T* x_data = x.data(); + const T* y_data = y.data(); + const Tout* out_data = out.data(); + const Tout* dout_data = dout.data(); + T* dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); + T* dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + template void CommonForwardBroadcastCPU(const DenseTensor& x, const DenseTensor& y, @@ -214,7 +267,7 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, int* y_dims_array, int* out_dims_array, int max_dim, - const paddle::platform::CPUDeviceContext& ctx, + const CPUContext& ctx, Functor func, const bool is_xsize_larger = true) { std::vector index_array(max_dim, 0); @@ -245,16 +298,15 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, } template -void CommonElementwiseBroadcastForward( - const paddle::platform::CPUDeviceContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - const DDim& x_dims, - const DDim& y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { +void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + const DDim& x_dims, + const DDim& y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { int max_dim = (std::max)(x_dims.size(), y_dims.size()); axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); PADDLE_ENFORCE_GE( @@ -302,7 +354,7 @@ void CommonElementwiseBroadcastForward( // TODO(liuyiqun): optimize the CPU implementation to support all broadcast // cases and avoid the need of XxxInverseFunctor. 
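// Illustrative usage sketch; the functor and tensor names here are
// hypothetical, only the CPUContext-based signature below is from this code:
//   ElementwiseCompute<SomeFunctor<T>, T>(dev_ctx, x, y, axis,
//                                         SomeFunctor<T>(), &out);
// where dev_ctx is a pten::CPUContext and x, y, out are pten::DenseTensors,
// rather than a paddle::platform::CPUDeviceContext as in the old signature.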
template -void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, +void ElementwiseCompute(const CPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -317,9 +369,8 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, is_xsize_larger = false; max_dim = y_dims.size(); } - funcs:: - TransformFunctor - functor(x, y, z, dev_ctx, func, is_xsize_larger); + funcs::TransformFunctor functor( + x, y, z, dev_ctx, func, is_xsize_larger); if (x_dims == y_dims) { functor.Run(); return; @@ -381,7 +432,7 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, template struct SameDimsElementwiseCompute { - void operator()(const paddle::platform::CPUDeviceContext& dev_ctx, + void operator()(const CPUContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { @@ -389,4 +440,113 @@ struct SameDimsElementwiseCompute { } }; +// BACKWARD CODE + +template +static void ElemwiseGradBroadcast1CPU(const T* x, + const T* y, + const Tout* out, + const Tout* dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T* dx, + T* dy) { + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } +} + +template +static void ElemwiseGradBroadcast2CPU(const T* x, + const T* y, + const Tout* out, + const Tout* dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T* dx, + T* dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } + } +} + } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index a0c6d5ba57011..be355557d548f 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/dense_tensor.h" @@ -23,6 +24,28 @@ namespace funcs { using DDim = paddle::framework::DDim; +template +struct ElemwiseGradNoBroadcast { + const T *x_; + const T *y_; + const Tout *out_; + const Tout *dout_; + + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + template class RowwiseTransformIterator; @@ -378,5 +401,36 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } } + +template +void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, + const DDim &x_dim, + const DDim &y_dim, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + size_t N = static_cast(paddle::framework::product(x_dim)); + paddle::platform::ForRange for_range(dev_ctx, N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), + y.data(), + out.data(), + dout.data(), + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(dev_ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(dev_ctx.GetPlace())}); +} + } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 049e430154a8b..4dfcd7a2152e0 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -20,6 +20,14 @@ limitations under the License. */ #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" +#ifdef __HIPCC__ +constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; +#else +constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; +#endif +#define BLOCK_X 32 +#define BLOCK_Y 32 + namespace pten { namespace kps = paddle::operators::kernel_primitives; @@ -31,6 +39,7 @@ template using ConditionalT = typename std::conditional_t>; +// FORWARD CODE template &broadcast_pos) { + for (int i = 1; i < broadcast_pos.size(); ++i) { + if (broadcast_pos[i] != broadcast_pos[i - 1] + 1) { + return false; + } + } + return true; +} + +inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs, + int *x_trans_indexs, + const int max_dim, + const int x_one_size) { + int diff = max_dim - x_one_size; + std::copy_n(x_one_indexs, x_one_size, x_trans_indexs + diff); + int p = 0; + int q = diff; + for (int i = 0; i < max_dim; ++i) { + if (q < max_dim && i == x_trans_indexs[q]) { + ++q; + } else { + x_trans_indexs[p++] = i; + } + } +} + +// Check input can be split into 2 parts +static inline bool SplitDims(const std::vector &y_broadcast_pos, + int max_dim) { + bool can_split_dim2 = true; + // must at start or end. 
+ if (y_broadcast_pos[0] != 0 && + y_broadcast_pos[y_broadcast_pos.size() - 1] != max_dim - 1) { + can_split_dim2 = false; + } else { + for (int i = 1; i < y_broadcast_pos.size(); ++i) { + // dim must be continue + if (y_broadcast_pos[i] != y_broadcast_pos[i - 1] + 1) { + can_split_dim2 = false; + break; + } + } + } + return can_split_dim2; +} + +inline void ComputeBroadcastKernelSize(int *x_dims_array, + int *out_dims_array, + int *x_blocks, + int *x_threads, + int max_dim) { + *x_blocks = 1; + *x_threads = 1; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == out_dims_array[i]) { + *x_blocks *= x_dims_array[i]; + } else { + *x_threads *= out_dims_array[i]; + } + } +} + +template +static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + int y_pre, + int y_n, + int y_post, + bool is_xsize, + OP op, + T *dd) { + int tid = threadIdx.x; + int bid = blockIdx.x; + + T val(0); + if (is_xsize) { + // do reduce for x + for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { + int b_i = bid / post; + int b_j = bid % post; + int x_offset = b_i * n * post + b_j; + int out_offset = b_i * n * post + i * post + b_j; + + // Get y pre rows id with x post and y_pre. + int b_yi = bid / (post * y_pre); + int b_yj = bid % y_post; + int y_offset = b_yi * y_n + i * y_post + b_yj; + + if (dd) { + val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]); + } + } + if (dd) { + int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; + val = paddle::platform::reduceSum(val, tid, h); + if (tid == 0) { + dd[bid] = val; + } + } + } else { + // do reduce for y + for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { + int b_i = bid / post; + int b_j = bid % post; + int y_offset = b_i * n * post + b_j; + int out_offset = b_i * n * post + i * post + b_j; + + int b_yi = bid / (post * y_pre); + int b_yj = bid % y_post; + int x_offset = b_yi * y_n + i * y_post + b_yj; + + if (dd) { + val += op(x[x_offset], y[y_offset], out[out_offset], dout[out_offset]); + } + } + if (dd) { + int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; + val = paddle::platform::reduceSum(val, tid, h); + if (tid == 0) { + dd[bid] = val; + } + } + } +} + +template +static __global__ void FastCommonGradBroadcastAllCUDAKernel( + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + int tid = threadIdx.x; + int bid = blockIdx.x; + + T val(0); + if (is_xsize_larger) { + for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { + int b_i = bid / post; + int b_j = bid % post; + int x_offset = b_i * n * post + i * post + b_j; + int y_offset = b_i * post + b_j; + if (dx) { + dx[x_offset] = + dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); + } + if (dy) { + val += dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); + } + } + if (dy) { + int h = n > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : n; + val = paddle::platform::reduceSum(val, tid, h); + if (tid == 0) { + dy[bid] = val; + } + } + } else { + for (int i = tid; i < n; i += ELEMWISE_MAX_BLOCK_DIM) { + int b_i = bid / post; + int b_j = bid % post; + int y_offset = b_i * n * post + i * post + b_j; + int x_offset = b_i * post + b_j; + if (dy) { + dy[y_offset] = + dy_op(x[x_offset], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx) { + val += dx_op(x[x_offset], y[y_offset], out[y_offset], dout[y_offset]); + } + } + if (dx) { + int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; + val = paddle::platform::reduceSum(val, tid, h); + if (tid == 0) { + dx[bid] = val; + } + } + } +} + +template +static __global__ void FastCommonGradBroadcastCUDAKernelHeight(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + DY_OP dy_op, + T *dy, + int x_h, + int x_w, + bool is_y) { + __shared__ T sdata[BLOCK_Y][BLOCK_X + 1]; + + T val(0); + size_t width_stride = gridDim.x * blockDim.x; + size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t full_width = + (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0); + size_t full_height = + (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? BLOCK_Y : 0); + if (is_y) { + for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int out_offset = n * w + m; + int x_offset = (n % x_h) * x_w + m % x_w; + if (dy) { + if (m < w && n < h) { + T val = dy_op(x[x_offset], y[m], out[out_offset], dout[out_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } + } + if (dy) { + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) { + my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + } + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; + } + __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dy[m] = sdata[0][threadIdx.x]; + } + } + } + } else { + for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int out_offset = n * w + m; + int y_offset = (n % x_h) * x_w + m % x_w; + if (dy) { + if (m < w && n < h) { + T val = dy_op(x[m], y[y_offset], out[out_offset], dout[out_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } + } + if (dy) { + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) { + my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + } + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; + } + __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dy[m] = sdata[0][threadIdx.x]; + } + } + } + } +} + +template +static __global__ void CommonGradBroadcast1CUDAKernelHeight(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + DY_OP dy_op, + T *dy, + int x_h, + int x_w, + bool is_y) { + int j = blockIdx.x; + int i = threadIdx.x; + int tid = threadIdx.x; + T val(0); + + if (is_y) { + do { + int out_offset = i * w + j; + int x_offset = (i % x_h) * x_w + j % x_w; + if (dy) { + val += dy_op(x[x_offset], y[j], out[out_offset], dout[out_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (dy) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { + do { + int out_offset = i * w + j; + int y_offset = (i % x_h) * x_w + j % x_w; + if (dy) { + val += dy_op(x[j], y[y_offset], out[out_offset], dout[out_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (dy) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } +} + +template +static __global__ void ElemwiseGradBroadcast1CUDAKernel(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + int j = blockIdx.x; + int i = threadIdx.x; + int tid = threadIdx.x; + T val(0); + if (is_xsize_larger) { + do { + int x_offset = i * w + j; + if (dx) { + dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy) { + val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (dy) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { // x.dims < y.dims, broadcast for x. + do { + int y_offset = i * w + j; + if (dy) { + dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx) { + val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (dx) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +// suppose use 2D block is fast because more parallel +// and memory coalesced +template +static __global__ void FastElemwiseGradBroadcast1CUDAKernel( + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + __shared__ T sdata[BLOCK_Y][BLOCK_X + 1]; + + T val(0); + size_t width_stride = gridDim.x * blockDim.x; + size_t idx = threadIdx.x + blockDim.x * blockIdx.x; + size_t full_width = + (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0); + size_t full_height = + (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? BLOCK_Y : 0); + if (is_xsize_larger) { + for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int x_offset = n * w + m; + if (dx && m < w && n < h) { + dx[x_offset] = + dx_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); + } + if (dy) { + if (m < w && n < h) { + T val = dy_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } + } + if (dy) { + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) + my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; + } + __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dy[m] = sdata[0][threadIdx.x]; + } + } + } + } else { // x.dims < y.dims, broadcast for x. 
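+    // In this branch y holds the broadcast (larger) shape: dy is written
+    // elementwise, while dx is reduced over the h dimension for each column
+    // through the shared-memory tile and warp shuffles, mirroring the
+    // is_xsize_larger branch above with the roles of dx and dy swapped.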
+ for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int y_offset = n * w + m; + if (dy && m < w && n < h) { + dy[y_offset] = + dy_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx) { + if (m < w && n < h) { + T val = dx_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } + } + if (dx) { + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) + my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; + } + __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dx[m] = sdata[0][threadIdx.x]; + } + } + } + } +} + +template +static __global__ void ElemwiseGradBroadcast2CUDAKernel(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + int tid = threadIdx.x; + int j = blockIdx.x; + + T val(0); + int ttid = tid; + + if (is_xsize_larger) { + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; + + int x_offset = i * n * post + j * post + k; + + if (dx != nullptr) { + dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + + if (dy != nullptr) { + val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; + } + + if (dy) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { // x.dims < y.dims, broadcast for x. + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; + + int y_offset = i * n * post + j * post + k; + + if (dy != nullptr) { + dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + + if (dx != nullptr) { + val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; + } + + if (dx) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +template +static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + // For small case use 1D block + constexpr int half_walf = 16; + if (w < half_walf || h < half_walf) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + ElemwiseGradBroadcast1CUDAKernel<<>>( + x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); + } else { + // suppose perfoemance improves with h increased. 
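+    // A 2D (BLOCK_X x BLOCK_Y) thread block is launched: w is tiled across
+    // blocks (grid_size below), and each block reduces over h through the
+    // shared-memory buffer and warp shuffles inside
+    // FastElemwiseGradBroadcast1CUDAKernel.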
+ dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<>>( + x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); + } +} + +template +static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); + int gird_size = n; + ElemwiseGradBroadcast2CUDAKernel<<>>( + x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); +} + } // namespace pten From e7f2bf37106b652a366ca123cbf73eda218d915b Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Wed, 12 Jan 2022 14:57:02 +0800 Subject: [PATCH 104/151] Add pten change file check for op benchmark (#38796) * Add pten change file check for op benchmark * fix style format * test * revert --- tools/ci_op_benchmark.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 8d28b71a3444d..9872a3b1a330b 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -43,6 +43,10 @@ function match_cu_file_directory { do [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 done + for sub_dir in "" "/gpu" "/hybird" + do + [ "${cu_file_dir}" == "paddle/pten/kernels${sub_dir}" ] && return 0 + done return 1 } @@ -50,7 +54,7 @@ function match_cu_file_directory { function load_CHANGE_OP_FILES_by_header_file { LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file - for change_file in $(grep -rl "${1}" paddle/fluid/operators) + for change_file in $(grep -rl "${1}" paddle/fluid/operators paddle/pten/kernels/) do if [[ "$change_file" =~ "_op.cu" ]] then @@ -76,7 +80,7 @@ function load_CHANGE_OP_FILES { for change_file in $(git diff --name-only develop) do # match directory limit - [[ "$change_file" =~ "paddle/fluid/operators/" ]] || continue + [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/pten/kernels/" ]] || continue # match file name limit if [[ "$change_file" =~ "_op.cu" ]] then @@ -295,7 +299,6 @@ if [ -n "${approval_line}" ]; then exit 0 fi fi -set -x case $1 in run_op_benchmark) From 4640955c3ac0e3629f5bbdcf649823f5a146e99f Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 12 Jan 2022 16:02:33 +0800 Subject: [PATCH 105/151] support test_auto_prune_partial (#38871) --- paddle/fluid/eager/api/utils/tensor_utils.cc | 1 - paddle/fluid/eager/backward.cc | 4 +- paddle/fluid/eager/eager_tensor.h | 8 ++-- paddle/fluid/eager/grad_node_info.cc | 45 +++---------------- paddle/fluid/eager/grad_node_info.h | 1 - .../grad_node_info_test.cc | 4 +- .../eager/tests/task_tests/backward_test.cc | 28 +++++++++--- .../cross_batch_accumulation_test.cc | 4 +- .../fluid/eager/tests/task_tests/hook_test.cc | 8 +++- paddle/fluid/pybind/eager_method.cc | 44 ++++++++++++++++++ .../tests/unittests/test_egr_python_api.py | 27 +++++++++++ .../unittests/test_imperative_auto_prune.py | 15 ++++++- 12 files changed, 130 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index ad6c34b7cf86c..115c9144df222 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -49,7 +49,6 @@ egr::EagerTensor CreateTensorWithValue(const 
pten::DDim& ddim, egr::EagerTensor out = egr::EagerTensor(); out.set_tensor(std::make_shared(tensor)); auto meta = EagerUtils::autograd_meta(&out); - if (is_leaf) { auto accumulation_node = std::make_shared(); meta->SetGradNode(accumulation_node); diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 9a760c03728cd..01cb1b81e341e 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -181,7 +181,9 @@ void RunBackward(const std::vector& tensors, PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " - ") or the same as number of output grad tensors")); + ") or the same as number of output grad tensors, but we " + "got edges size is: %d, grad_output size is: %d", + edges.size(), grad_output_tensors.size())); for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 72fe5732e9620..80faad9080ffe 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -195,7 +195,6 @@ class EagerTensor final { } tensor_->copy_(*(src.tensor_.get()), blocking); } - /* Part 6: Operator overloading */ EagerTensor& operator=(const EagerTensor& x) & { tensor_ = x.tensor_; @@ -238,7 +237,7 @@ class EagerTensor final { // Contruct framework::Tensor from egr::EagerTensor auto tensor_dense = std::dynamic_pointer_cast(tensor_->impl()); - if (tensor_dense) { + if (tensor_dense && tensor_dense.get()) { paddle::experimental::SharesStorage(tensor_dense.get(), framework_tensor); } else { @@ -292,11 +291,10 @@ class EagerTensor final { template void SetImplWithLegacyTensor() { const auto& framework_tensor = var_.Get(); - if (this->initialized()) { + if (defined()) { VLOG(8) << "Sync Var to initialized tensor for: " << name(); paddle::experimental::ReMakePtenDenseTensor( - framework_tensor, - static_cast(this->impl().get())); + framework_tensor, static_cast(impl().get())); } else { VLOG(8) << "Sync Var to uninitialized tensor for: " << name(); this->set_impl(std::move( diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 0e6f6aa63dd0f..49bd416d46a76 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -47,45 +47,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops - if (meta) { + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); if (node) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - if (!meta->StopGradient()) { - meta->SetGradNode(std::make_shared()); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } - } - } - } -} - -void GradNodeBase::AddEdges(const std::vector& metas, - size_t slot_id) { - PADDLE_ENFORCE_LT( - slot_id, adj_edges_.size(), - paddle::platform::errors::InvalidArgument( - "Given slot id is out of range of adj_edges outter size, " - "adj_edges is designed to has the same size of grad " - "inputs's slot num.")); - for (const auto& meta : metas) { - // adj_edges has as same rank as fwd inputs, and record it's output rank - // from - // its pre-ops - if (meta) { - auto node = meta->GetMutableGradNode(); - if (node) { + meta->SetGradNode(std::make_shared()); 
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); - } else { - if (!meta->StopGradient()) { - meta->SetGradNode(std::make_shared()); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } } } } @@ -98,17 +68,16 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - if (meta) { + if (meta && !meta->StopGradient()) { + VLOG(6) << "Add Edges for slot: " << slot_id; auto node = meta->GetMutableGradNode(); if (node) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - if (!meta->StopGradient()) { - meta->SetGradNode(std::make_shared()); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 545b577f4bda9..f15c50ef75190 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -106,7 +106,6 @@ class GradNodeBase { * This one is called slot by slot * **/ void AddEdges(std::vector* metas, size_t slot_id); - void AddEdges(const std::vector& metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); /** diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index aebb0553e28b6..a89fb019d5b37 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -56,15 +56,17 @@ TEST(GradNodeInfo, GradNodeBase) { VLOG(6) << "Test Add Edges"; egr::Edge edge0(grad_test_node1, 1, 2); auto auto_grad0 = std::make_shared(edge0); + auto_grad0->SetStopGradient(false); egr::Edge edge1(grad_test_node1, 3, 4); auto auto_grad1 = std::make_shared(edge1); + auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; - grad_test_node0->AddEdges(metas, 1); + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().second, diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0ec86b7cc360c..3737fd95ad64d 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -69,9 +69,11 @@ TEST(Backward, SingleNodeEmptyGrad) { // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); + meta.SetStopGradient(false); meta.SetSingleOutRankWithSlot(0, 0); meta.SetGradNode(acc_node_ptr); - node0_ptr->AddEdges({&meta}, 0); + std::vector res = {&meta}; + node0_ptr->AddEdges(&res, 0); } std::vector outs = {target_tensor}; // Run Backward @@ -130,9 +132,11 @@ TEST(Backward, SingleNodeCustomGrad) { // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); + meta.SetStopGradient(false); meta.SetSingleOutRankWithSlot(0, 0); meta.SetGradNode(acc_node_ptr); - 
node0_ptr->AddEdges({&meta}, 0); + std::vector res = {&meta}; + node0_ptr->AddEdges(&res, 0); } // Run Backward @@ -188,9 +192,11 @@ TEST(Backward, LinearNodes) { // Connect Node0 -> Node1 via Edge auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); meta0.SetSingleOutRankWithSlot(0, 0); meta0.SetGradNode(node1_ptr); - node0_ptr->AddEdges({&meta0}, 0); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); // Connect Tensor and AccumulationNode via AutoGradMeta auto acc_node_ptr = std::make_shared(); @@ -204,9 +210,11 @@ TEST(Backward, LinearNodes) { // Connect Node1 -> AccumulationNode via Edge auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); meta1.SetSingleOutRankWithSlot(0, 0); meta1.SetGradNode(acc_node_ptr); - node1_ptr->AddEdges({&meta1}, 0); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); } // Use Empty Grad Tensor @@ -283,15 +291,19 @@ TEST(Backward, WithAccumulation) { // Connect Node0 -> Node2 via Edge auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); meta0.SetSingleOutRankWithSlot(0, 0); meta0.SetGradNode(node2_ptr); - node0_ptr->AddEdges({&meta0}, 0); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); // Connect Node1 -> Node2 via Edge auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); meta1.SetSingleOutRankWithSlot(0, 0); meta1.SetGradNode(node2_ptr); - node1_ptr->AddEdges({&meta1}, 0); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); // Connect Tensor and AccumulationNode via AutoGradMeta auto acc_node_ptr = std::make_shared(); @@ -305,9 +317,11 @@ TEST(Backward, WithAccumulation) { // Connect Node2 -> AccumulationNode via Edge auto meta2 = egr::AutogradMeta(); + meta2.SetStopGradient(false); meta2.SetSingleOutRankWithSlot(0, 0); meta2.SetGradNode(acc_node_ptr); - node2_ptr->AddEdges({&meta2}, 0); + std::vector res2 = {&meta2}; + node2_ptr->AddEdges(&res2, 0); } RunBackward(target_tensors, grad_tensors); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 52e10b2b1b8a0..7f180fa1076fd 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -62,8 +62,10 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto meta = AutogradMeta(); meta.SetSingleOutRankWithSlot(0, 0); + meta.SetStopGradient(false); meta.SetGradNode(acc_node_ptr); - scale_node_ptr->AddEdges({&meta}, 0); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 4ec49bfa56676..0f8039dade801 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -105,9 +105,11 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Connect ScaleNode -> AccumulationNode via Edge { auto meta = AutogradMeta(); + meta.SetStopGradient(false); meta.SetSingleOutRankWithSlot(0, 0); meta.SetGradNode(acc_node_ptr); - scale_node_ptr->AddEdges({&meta}, 0); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); } // Retain Grad for leaf tensor1 @@ -180,9 +182,11 @@ TEST(RetainGrad, HookAfterRetainGrad) { // Connect ScaleNode -> AccumulationNode via Edge { auto meta = AutogradMeta(); + meta.SetStopGradient(false); 
meta.SetSingleOutRankWithSlot(0, 0); meta.SetGradNode(acc_node_ptr); - scale_node_ptr->AddEdges({&meta}, 0); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); } // Retain Grad for leaf tensor1 diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index c56fe5be4da69..a0067f9c64fb1 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -234,6 +234,44 @@ static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_tensor__share_buffer_to(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + egr::EagerTensor* src_ptr = + &(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor); + PADDLE_ENFORCE_EQ(self->eager_tensor.initialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized! please initialize " + "src tensor before share_buffer_with to other.", + self->eager_tensor.name())); + src_ptr->set_impl(self->eager_tensor.impl()); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor__is_shared_buffer_with(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + egr::EagerTensor src_tensor = + CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized! please initialize " + "src tensor before share_buffer_with to other.", + src_tensor.name())); + bool res = false; + if (!self->eager_tensor.defined() || !src_tensor.defined()) { + return ToPyObject(res); + } + res = (self->eager_tensor.impl().get() == src_tensor.impl().get()); + return ToPyObject(res); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_tensor_method_detach(EagerTensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_SYNC_TRY @@ -278,6 +316,12 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_shared_buffer_to", + (PyCFunction)(void (*)(void))eager_tensor__share_buffer_to, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_share_buffer_with", + (PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with, + METH_VARARGS | METH_KEYWORDS, NULL}, {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index e4576fe2ea8bd..3ab7981cdb1a4 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -645,6 +645,33 @@ def test_copy_and_copy_to(self): self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_cpu_place()) + def test_share_buffer_to(): + arr = np.ones([4, 16, 16, 32]).astype('float32') + arr1 = np.zeros([4, 16]).astype('float32') + arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( + [4, 16, 16, 32]).astype('float32') + tensor = None + tensor2 = None + tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, + core.CPUPlace()) + tensor3 = core.eager.EagerTensor() + if core.is_compiled_with_cuda(): + tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, + core.CUDAPlace(0)) + else: + tensor2 = paddle.to_tensor(arr2, 
core.VarDesc.VarType.FP32, + core.CPUPlace()) + self.assertTrue(np.array_equal(tensor.numpy(), arr1)) + self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + tensor2._share_buffer_to(tensor) + self.assertTrue(np.array_equal(tensor.numpy(), arr2)) + self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + self.assertTrue(tensor._is_shared_buffer_with(tensor2)) + self.assertTrue(tensor2._is_shared_buffer_with(tensor)) + tensor._share_buffer_to(tensor3) + self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + self.assertTrue(tensor3._is_shared_buffer_with(tensor)) + def test_properties(self): print("Test_properties") with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index b82a058ae4eb1..d2e1a4fbb1882 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -15,6 +15,7 @@ import unittest import paddle.fluid as fluid import numpy as np +from paddle.fluid.framework import _test_eager_guard class AutoPruneLayer0(fluid.Layer): @@ -145,7 +146,7 @@ def embed_linear0(self, x): class TestImperativeAutoPrune(unittest.TestCase): - def test_auto_prune(self): + def func_auto_prune(self): with fluid.dygraph.guard(): case1 = AutoPruneLayer0(input_size=5) value1 = np.arange(25).reshape(5, 5).astype("float32") @@ -157,7 +158,12 @@ def test_auto_prune(self): self.assertTrue(case1.linear2.weight._grad_ivar() is not None) self.assertTrue(case1.linear1.weight._grad_ivar() is not None) - def test_auto_prune2(self): + def test_auto_prune(self): + with _test_eager_guard(): + self.func_auto_prune() + self.func_auto_prune() + + def func_auto_prune2(self): with fluid.dygraph.guard(): case2 = AutoPruneLayer1(input_size=5) value1 = np.arange(25).reshape(5, 5).astype("float32") @@ -170,6 +176,11 @@ def test_auto_prune2(self): self.assertTrue(case2.linear2.weight._grad_ivar() is None) self.assertTrue(case2.linear1.weight._grad_ivar() is not None) + def test_auto_prune2(self): + with _test_eager_guard(): + self.func_auto_prune2() + self.func_auto_prune2() + def test_auto_prune3(self): with fluid.dygraph.guard(): case3 = AutoPruneLayer3(input_size=784) From 5fc8bbf79d3551654a83562d23d2cc8a76c60ba2 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 12 Jan 2022 16:07:16 +0800 Subject: [PATCH 106/151] [pten]Move dot, conj, sign dev_api into kernel.h (#38862) * move dot_dev api into dot_kernel.h * add infermate header * modify to dotkerel in dot_op.h * mvoe conj dev api into complex_kernel.h * move sign dev api into sign_kernel.h --- paddle/fluid/operators/conj_op.h | 2 +- paddle/fluid/operators/dot_op.h | 3 +- paddle/fluid/operators/sign_op.h | 2 +- paddle/pten/all.h | 1 - paddle/pten/include/linalg.h | 37 ------------------- paddle/pten/include/math.h | 12 ------ paddle/pten/kernels/complex_kernel.h | 4 +- paddle/pten/kernels/cpu/sign_kernel.cc | 3 +- paddle/pten/kernels/dot_kernel.h | 12 +++++- paddle/pten/kernels/gpu/sign_kernel.cu | 2 +- .../pten/kernels/impl/complex_kernel_impl.h | 4 +- paddle/pten/kernels/impl/sign_kernel_impl.h | 4 +- paddle/pten/kernels/sign_kernel.h | 12 +++++- .../pten/tests/kernels/test_conj_dev_api.cc | 2 +- paddle/pten/tests/kernels/test_dot_dev_api.cc | 2 +- 15 files changed, 39 insertions(+), 63 deletions(-) delete mode 100644 paddle/pten/include/linalg.h diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 381f4cb66b3cd..71115c2eba796 
100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -39,7 +39,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 02ba57ef8d495..8817e2f3ca79d 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -46,7 +46,8 @@ class DotKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::DotKernel(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get()); + pten::DotKernel(dev_ctx, *pt_x.get(), *pt_y.get(), + pt_out.get()); } }; diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b93c062cda200..b8dd44c01b050 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -39,7 +39,7 @@ class SignKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Sign(dev_ctx, *pt_x.get(), pt_out.get()); + pten::SignKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/pten/all.h b/paddle/pten/all.h index 7dd517e5e6381..c8be629b10e75 100644 --- a/paddle/pten/all.h +++ b/paddle/pten/all.h @@ -17,5 +17,4 @@ limitations under the License. */ // developer apis #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" -#include "paddle/pten/include/linalg.h" #include "paddle/pten/include/math.h" diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h deleted file mode 100644 index 71bc518aa89f8..0000000000000 --- a/paddle/pten/include/linalg.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/dot_kernel.h" - -namespace pten { - -template -DenseTensor Dot(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - auto out_meta = DotInferMeta(x.meta(), y.meta()); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - DotKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index 5070d0d4e0e5a..a4fb7f4d98faf 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -17,22 +17,10 @@ limitations under the License. 
*/ // See Note: [ How do we organize the kernel directory ] #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/kernels/scale_kernel.h" namespace pten { -template -DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) { - auto out_meta = UnchangedInferMeta(x.meta()); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Sign(dev_ctx, x, &dense_out); - return dense_out; -} - template DenseTensor Scale(const ContextT& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index e9f717152a458..9dd3d457e4a26 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" @@ -27,7 +29,7 @@ void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); template DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = Empty(dev_ctx, std::move(out_meta)); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); ConjKernel(dev_ctx, x, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index c6e352f7da44a..a7b62822d6e0f 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -21,4 +21,5 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/bfloat16.h" -PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::Sign, float, double) {} +PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) { +} diff --git a/paddle/pten/kernels/dot_kernel.h b/paddle/pten/kernels/dot_kernel.h index 5ef660265333e..47f1c89109e7e 100644 --- a/paddle/pten/kernels/dot_kernel.h +++ b/paddle/pten/kernels/dot_kernel.h @@ -15,7 +15,8 @@ #pragma once #include "paddle/pten/core/dense_tensor.h" - +#include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template @@ -24,4 +25,13 @@ void DotKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); +template +DenseTensor Dot(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + auto out_meta = DotInferMeta(x.meta(), y.meta()); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + DotKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index 42b39141460fe..e7eb7e46861c8 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -24,4 +24,4 @@ limitations under the License. 
*/ using float16 = paddle::platform::float16; PT_REGISTER_CTX_KERNEL( - sign, GPU, ALL_LAYOUT, pten::Sign, float, double, float16) {} + sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {} diff --git a/paddle/pten/kernels/impl/complex_kernel_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h index e0c6825a78a53..d7132b05f7f04 100644 --- a/paddle/pten/kernels/impl/complex_kernel_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -21,14 +21,14 @@ namespace pten { template -void ConjKernel(const Context& context, +void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); auto* out_data = out->mutable_data(); - paddle::platform::ForRange for_range(context, numel); + paddle::platform::ForRange for_range(dev_ctx, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h index d663808f03792..306206f1fc3fa 100644 --- a/paddle/pten/kernels/impl/sign_kernel_impl.h +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -23,7 +23,9 @@ limitations under the License. */ namespace pten { template -void Sign(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { +void SignKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { out->mutable_data(); auto eigen_out = pten::EigenVector::Flatten(*out); auto eigen_x = pten::EigenVector::Flatten(x); diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h index 2cf5ca973f093..ba205fc96a15c 100644 --- a/paddle/pten/kernels/sign_kernel.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -15,10 +15,20 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Sign(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +DenseTensor Sign(const Context& dev_ctx, const DenseTensor& x) { + auto out_meta = UnchangedInferMeta(x.meta()); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + SignKernel(dev_ctx, x, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 0438a8f4f462b..3392626dc2ad3 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 3fda81d3b5eae..6e2166cb673bd 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/pten/include/linalg.h" +#include "paddle/pten/kernels/dot_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" From 12c5b1fea4906519bfa5ffa6167515b3c2d45067 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 16:35:36 +0800 Subject: [PATCH 107/151] [part 6]change type of function args (#38891) --- .../kernel_primitives/functor_primitives.h | 30 +++++++++---------- paddle/pten/kernels/gpu/math_kernel.cu | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index 2bd8721b82fa2..5e3c1fc202d59 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -53,7 +53,7 @@ struct ExpFunctor { HOSTDEVICE explicit inline ExpFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(details::Exp(x)); } }; @@ -67,7 +67,7 @@ struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(x); } }; @@ -85,7 +85,7 @@ struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {} - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(static_cast(x) * n_inv); } @@ -102,7 +102,7 @@ struct InverseFunctor { HOSTDEVICE explicit inline InverseFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(-x); } }; @@ -116,7 +116,7 @@ struct SquareFunctor { HOSTDEVICE explicit inline SquareFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx& x) const { + HOSTDEVICE inline Ty operator()(const Tx x) const { return static_cast(x) * static_cast(x); } }; @@ -130,7 +130,7 @@ template struct MinFunctor { inline T initial() { return static_cast(std::numeric_limits::max()); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return (b < a) ? b : a; } }; @@ -144,7 +144,7 @@ struct MaxFunctor { return static_cast(std::numeric_limits::lowest()); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return (b > a) ? 
b : a; } }; @@ -156,7 +156,7 @@ template struct AddFunctor { inline T initial() { return static_cast(0.0f); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return b + a; } }; @@ -168,7 +168,7 @@ template struct MulFunctor { inline T initial() { return static_cast(1.0f); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return b * a; } }; @@ -180,7 +180,7 @@ template struct LogicalOrFunctor { inline T initial() { return static_cast(false); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return b || a; } }; @@ -192,7 +192,7 @@ template struct LogicalAndFunctor { inline T initial() { return static_cast(true); } - __device__ __forceinline__ T operator()(const T& a, const T& b) const { + __device__ __forceinline__ T operator()(const T a, const T b) const { return b && a; } }; @@ -204,7 +204,7 @@ template struct SubFunctor { inline T initial() { return static_cast(0.0f); } - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } + inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; } }; /** @@ -214,7 +214,7 @@ template struct DivFunctor { inline T initial() { return static_cast(1.0f); } - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } }; template @@ -222,7 +222,7 @@ struct DivFunctor::value>::type> { inline T initial() { return static_cast(1.0f); } - inline HOSTDEVICE T operator()(const T& a, const T& b) const { + inline HOSTDEVICE T operator()(const T a, const T b) const { // For int32/int64, need to check whether the divison is zero. 
PADDLE_ENFORCE_NE(b, 0, platform::errors::InvalidArgument( @@ -239,7 +239,7 @@ template struct FloorDivFunctor { inline T initial() { return static_cast(1.0f); } - inline HOSTDEVICE T operator()(const T& a, const T& b) const { + inline HOSTDEVICE T operator()(const T a, const T b) const { PADDLE_ENFORCE_NE(b, 0, platform::errors::InvalidArgument( "Integer division by zero encountered " diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index f41934313d674..557080638038d 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -61,7 +61,7 @@ struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv(static_cast(1.0 / n)) {} - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + HOSTDEVICE inline T operator()(const T x) const { return x * n_inv; } private: T n_inv; From c2f825d776171cea7176e4c567e88afde2876ec7 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 12 Jan 2022 16:51:37 +0800 Subject: [PATCH 108/151] optimize elementwise_min_grad using new reduce interface (#38236) * ini commit * multi-outputs init commit * optimize code * remove inplace --- .../elementwise/elementwise_functor.h | 26 ++++++++++++++++ .../elementwise/elementwise_min_op.cu | 30 +++++++++++++++++-- .../elementwise/elementwise_min_op.h | 26 ++++++++++++++-- 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index e2689cefd43a7..438a47f5dc593 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -233,6 +233,32 @@ struct FMinFunctor { } }; +template +struct MinGradXFunctor { + inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + return dout * static_cast(x < y); + } +}; +template +struct MinGradYFunctor { + inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + return dout * static_cast(x >= y); + } +}; + +template +struct MinGradXYFunctor { + inline HOSTDEVICE paddle::framework::Array operator()( + const InT& x, const InT& y, const InT& dout) { + paddle::framework::Array outs; + // dx = dout * (x < y) + outs[0] = static_cast(dout * static_cast(x < y)); + // dy = dout * (x >= y) + outs[1] = static_cast(dout * static_cast(x >= y)); + return outs; + } +}; + template struct MulGradFunctor { inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index b51dbcd883608..a733b4a66f129 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -24,15 +24,41 @@ class ElementwiseMinKernel void Compute(const framework::ExecutionContext& ctx) const override { std::vector ins; std::vector outs; - const auto& cuda_ctx = + const auto& dev_ctx = ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, MinFunctor()); + dev_ctx, ins, &outs, axis, MinFunctor()); } }; +template +typename std::enable_if< + std::is_same::value>::type +ElementwiseMinGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, 
framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + const auto& dev_ctx = + ctx.template device_context(); + const auto place = ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {x, y, dout}; + GetGradXAndYOut( + dev_ctx, place, axis, ins, dout, dx, dy, MinGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {x, y, dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, MinGradXFunctor()); + } else if (dx == nullptr && dy != nullptr) { + std::vector ins = {x, y, dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, MinGradYFunctor()); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index ffb8c965357a3..88fb044d42206 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -86,6 +86,28 @@ struct MinGradDy { }; #endif +template +typename std::enable_if< + std::is_same::value>::type +ElementwiseMinGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + ElemwiseGradCompute, MinGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +typename std::enable_if< + std::is_same::value>::type +ElementwiseMinGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); +#endif + template class ElementwiseMinGradKernel : public ElemwiseGradKernel { public: @@ -99,9 +121,7 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); + ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; From 5f5f626b22324329c3d2d6b3415cc1a4b3d53b11 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 12 Jan 2022 17:00:35 +0800 Subject: [PATCH 109/151] [PTen] Remove hybird dir (#38863) * remove hybird dir * resolve conflit --- paddle/fluid/operators/math/math_function.cc | 2 +- paddle/fluid/operators/math/math_function.cu | 2 +- paddle/pten/kernels/CMakeLists.txt | 4 +- paddle/pten/kernels/cpu/elementwise.h | 2 +- paddle/pten/kernels/cpu/reduce.h | 4 +- .../kernels/{hybird => funcs}/CMakeLists.txt | 1 - .../cuda => funcs/eigen}/CMakeLists.txt | 0 .../kernels/{hybird => funcs}/eigen/common.h | 0 .../kernels/{hybird => funcs}/transpose.cc | 2 +- .../kernels/{hybird => funcs}/transpose.cu | 2 +- .../kernels/{hybird => funcs}/transpose.h | 2 +- paddle/pten/kernels/gpu/dot_kernel.cu | 2 +- .../pten/kernels/hybird/eigen/CMakeLists.txt | 0 paddle/pten/kernels/hybird/eigen/sign.h | 41 ------------------- .../kernels/hybird/general/CMakeLists.txt | 0 .../pten/kernels/impl/dot_grad_kernel_impl.h | 2 +- paddle/pten/kernels/impl/full_kernel_impl.h | 2 +- paddle/pten/kernels/impl/scale_kernel_impl.h | 2 +- paddle/pten/kernels/impl/sign_kernel_impl.h | 2 +- 19 files changed, 15 insertions(+), 57 deletions(-) rename 
paddle/pten/kernels/{hybird => funcs}/CMakeLists.txt (92%) rename paddle/pten/kernels/{hybird/cuda => funcs/eigen}/CMakeLists.txt (100%) rename paddle/pten/kernels/{hybird => funcs}/eigen/common.h (100%) rename paddle/pten/kernels/{hybird => funcs}/transpose.cc (98%) rename paddle/pten/kernels/{hybird => funcs}/transpose.cu (98%) rename paddle/pten/kernels/{hybird => funcs}/transpose.h (97%) delete mode 100644 paddle/pten/kernels/hybird/eigen/CMakeLists.txt delete mode 100644 paddle/pten/kernels/hybird/eigen/sign.h delete mode 100644 paddle/pten/kernels/hybird/general/CMakeLists.txt diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 1efddc4818671..ec21524b0b880 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index a692246a06b1c..378f0426ddfb7 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" namespace paddle { namespace operators { diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index fc04cd797f4a5..45724e5d22abd 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -6,8 +6,8 @@ file(WRITE ${kernel_declare_file} "// Generated by the paddle/pten/kernels/CMake # kernel primitive api add_subdirectory(primitive) -# pten hybird functors and functions called by kernels -add_subdirectory(hybird) +# pten functors and functions called by kernels +add_subdirectory(funcs) add_subdirectory(cpu) if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 5a421de1173d8..97db997a16478 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/kernels/funcs/elementwise_base.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" namespace pten { diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index fa603b2163055..1e9c1e885f44d 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -22,8 +22,8 @@ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" -#include "paddle/pten/kernels/hybird/transpose.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/transpose.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pten { diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/funcs/CMakeLists.txt similarity index 92% rename from paddle/pten/kernels/hybird/CMakeLists.txt rename to paddle/pten/kernels/funcs/CMakeLists.txt index 5d04bae2eae82..32bdc94b95d52 100644 --- a/paddle/pten/kernels/hybird/CMakeLists.txt +++ b/paddle/pten/kernels/funcs/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(eigen) -add_subdirectory(general) cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) if(WITH_GPU) diff --git a/paddle/pten/kernels/hybird/cuda/CMakeLists.txt b/paddle/pten/kernels/funcs/eigen/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/hybird/cuda/CMakeLists.txt rename to paddle/pten/kernels/funcs/eigen/CMakeLists.txt diff --git a/paddle/pten/kernels/hybird/eigen/common.h b/paddle/pten/kernels/funcs/eigen/common.h similarity index 100% rename from paddle/pten/kernels/hybird/eigen/common.h rename to paddle/pten/kernels/funcs/eigen/common.h diff --git a/paddle/pten/kernels/hybird/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc similarity index 98% rename from paddle/pten/kernels/hybird/transpose.cc rename to paddle/pten/kernels/funcs/transpose.cc index d1d4350c93161..5a40abbd1b7e8 100644 --- a/paddle/pten/kernels/hybird/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pten/kernels/hybird/transpose.h" +#include "paddle/pten/kernels/funcs/transpose.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu similarity index 98% rename from paddle/pten/kernels/hybird/transpose.cu rename to paddle/pten/kernels/funcs/transpose.cu index 6ea5e36e106d5..e03c538e38682 100644 --- a/paddle/pten/kernels/hybird/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -16,7 +16,7 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/transpose.h" +#include "paddle/pten/kernels/funcs/transpose.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/pten/kernels/hybird/transpose.h b/paddle/pten/kernels/funcs/transpose.h similarity index 97% rename from paddle/pten/kernels/hybird/transpose.h rename to paddle/pten/kernels/funcs/transpose.h index 17f52c74a1344..d0e4dafe2c3b8 100644 --- a/paddle/pten/kernels/hybird/transpose.h +++ b/paddle/pten/kernels/funcs/transpose.h @@ -18,7 +18,7 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 1f9e7aa3f1cfd..08d8f83c408de 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -16,7 +16,7 @@ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/hybird/eigen/CMakeLists.txt b/paddle/pten/kernels/hybird/eigen/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/hybird/eigen/sign.h b/paddle/pten/kernels/hybird/eigen/sign.h deleted file mode 100644 index 0beebda4f39e8..0000000000000 --- a/paddle/pten/kernels/hybird/eigen/sign.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { -namespace eigen { - -template -void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) { - out->mutable_data(); - // TODO(chenweihang): if we design new tensor, we should support - // the low-level calc functor use new tensor as input, - // which may be a big project! - auto eigen_out = pten::EigenVector::Flatten(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - - auto& dev = *dev_ctx.eigen_device(); - paddle::operators::EigenSign, T>::Eval( - dev, eigen_out, eigen_x); -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/CMakeLists.txt b/paddle/pten/kernels/hybird/general/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index 16c87bbab474a..39cdbad5146de 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" #include "paddle/pten/kernels/complex_kernel.h" diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 9be40e22a0360..79ca63c9b0669 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h index 937b3115e63b3..2e0b158b36b8d 100644 --- a/paddle/pten/kernels/impl/scale_kernel_impl.h +++ b/paddle/pten/kernels/impl/scale_kernel_impl.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h index 306206f1fc3fa..655cda762ee1a 100644 --- a/paddle/pten/kernels/impl/sign_kernel_impl.h +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" From 4a64ca1e9df499d3d8822c00304b41b5215f2a93 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 12 Jan 2022 17:08:04 +0800 Subject: [PATCH 110/151] optimize elementwise_max_grad using new interfaces (#37906) * init elem_max_grad op * optimize code and reply review comments * ternary functors * apply new reduce func * move functor to .h * multi-outputs init * rearrange code * modifed functors * optimizer code * pass nullptr * revert the last change as seg fault occurs * optimize code * remove inplace * remove comments --- .../elementwise/elementwise_functor.h | 27 +++++++++++++++++ .../elementwise/elementwise_max_op.cu | 30 +++++++++++++++++-- .../elementwise/elementwise_max_op.h | 29 +++++++++++++++--- 3 files changed, 80 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 438a47f5dc593..a8c9640d479d3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -301,5 +301,32 @@ struct MulGradXYFunctor, Complex> { } }; +// Ternary compare +template +struct MaxGradXFunctor { + inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + return dout * static_cast(x > y); + } +}; +template +struct MaxGradYFunctor { + inline HOSTDEVICE T operator()(const T& x, const T& y, const T& dout) const { + return dout * static_cast(x <= y); + } +}; + +template +struct MaxGradXYFunctor { + inline HOSTDEVICE paddle::framework::Array operator()( + const InT& x, const InT& y, const InT& dout) { + paddle::framework::Array outs; + // dx = dout * (x > y) + outs[0] = static_cast(dout * static_cast(x > y)); + // dy = dout * (x <= y) + outs[1] = static_cast(dout * static_cast(x <= y)); + return outs; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 760429200889b..eaf7774428565 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -24,15 +24,41 @@ class ElementwiseMaxKernel void Compute(const framework::ExecutionContext& ctx) const override { std::vector ins; std::vector outs; - const auto& cuda_ctx = + const auto& dev_ctx = ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, MaxFunctor()); + dev_ctx, ins, &outs, axis, MaxFunctor()); } }; +template +typename std::enable_if< + std::is_same::value>::type 
+ElementwiseMaxGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + const auto& dev_ctx = + ctx.template device_context(); + const auto place = ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {x, y, dout}; + GetGradXAndYOut( + dev_ctx, place, axis, ins, dout, dx, dy, MaxGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {x, y, dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, MaxGradXFunctor()); + } else if (dx == nullptr && dy != nullptr) { + std::vector ins = {x, y, dout}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, MaxGradYFunctor()); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index a7a49fed87151..cff30be50a3d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -64,6 +64,28 @@ struct MaxGradDy { } }; +template +typename std::enable_if< + std::is_same::value>::type +ElementwiseMaxGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + ElemwiseGradCompute, MaxGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +typename std::enable_if< + std::is_same::value>::type +ElementwiseMaxGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); +#endif + template class ElementwiseMaxGradKernel : public ElemwiseGradKernel { public: @@ -74,12 +96,11 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = dout; // out is not necessary auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); + + ElementwiseMaxGrad(ctx, x, y, out, dout, dx, dy); } }; From cc24427ec33705294f80414055082922db576ca5 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 12 Jan 2022 17:14:55 +0800 Subject: [PATCH 111/151] [Dist Pass] Amp Pass (#38764) * auto parallel sharding base * chmod * add unitest * set unitest cmake dist label * revise code according to rewiew * chmod * bugfix for grad_clip and param broadcast * chmod * update unitest * chmod * add clip * chmod * add amp pass * chmod * add unitest * remove grad update * fixed bug * fixed bug * fixed typose * fixed typoes --- .../auto_parallel/operators/__init__.py | 1 + .../dist_check_finite_and_unscale.py | 178 +++++ .../distributed/auto_parallel/parallelizer.py | 23 +- .../auto_parallel/process_group.py | 2 +- python/paddle/distributed/passes/__init__.py | 1 + .../distributed/passes/auto_parallel_amp.py | 715 ++++++++++++++++++ .../passes/auto_parallel_sharding.py | 2 +- 
.../test_auto_parallel_amp_pass.py | 63 ++ 8 files changed, 973 insertions(+), 12 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py create mode 100644 python/paddle/distributed/passes/auto_parallel_amp.py create mode 100755 python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index d0ddeb1dcc711..5502cb3191a48 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -23,3 +23,4 @@ from . import dist_softmax from . import dist_transpose from . import dist_default +from . import dist_check_finite_and_unscale diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py new file mode 100644 index 0000000000000..00dc346f9a2ac --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from ..utils import set_var_dist_attr +from ..utils import set_dist_op_desc_original_id +from ..process_group import new_process_group +from ..dist_attribute import OperatorDistributedAttribute +from paddle.distributed.auto_parallel.process_group import get_world_process_group + +global_process_mesh = get_world_process_group().ranks + + +class DistributedCheckFiniteAndUnscale(DistributedOperatorImplContainer): + def __init__(self, name): + super(DistributedCheckFiniteAndUnscale, self).__init__() + self._name = name + + +register_distributed_operator_impl_container( + "check_finite_and_unscale", + DistributedCheckFiniteAndUnscale("check_finite_and_unscale")) + + +class DistributedCheckFiniteAndUnscaleImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedCheckFiniteAndUnscaleImpl, self).__init__() + self._name = name + self._forward_implemented = False + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + raise RuntimeError( + "DistributedCheckFiniteAndUnscaleImpl's is_input_compatible should not be called !" + ) + + def is_output_compatible(self, dist_op): + raise RuntimeError( + "DistributedCheckFiniteAndUnscaleImpl's is_output_compatible should not be called !" 
+ ) + + def update_dims_mapping(self, dist_op): + raise RuntimeError( + "DistributedCheckFiniteAndUnscaleImpl's update_dims_mapping should not be called !" + ) + + @staticmethod + def forward(ctx, *args, **kwargs): + raise RuntimeError( + "DistributedCheckFiniteAndUnscaleImpl's forward should not be called !" + ) + + @staticmethod + def backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.get_dst_main_program().global_block() + backward_op = dist_op_context.get_cur_src_op() + rank_id = dist_op_context.get_rank_id() + dist_attr = ctx.get_op_dist_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + assert rank_id in dist_attr.process_mesh.processes + + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'Scale' in kwargs, "input [{}] is not given".format('Scale') + assert 'Out' in kwargs, "input [{}] is not given".format('Out') + assert 'FoundInfinite' in kwargs, "output [{}] is not given".format( + 'FoundInfinite') + + assert len( + kwargs['Scale'] + ) == 1, "check_finite_and_unscale input Scale take 1 variable but got {}".format( + kwargs['Scale']) + assert len( + kwargs['FoundInfinite'] + ) == 1, "check_finite_and_unscale input FoundInfinite take 1 variable but got {}".format( + kwargs['FoundInfinite']) + assert len(kwargs['X']) == len( + kwargs['Out'] + ), "check_finite_and_unscale got [{}] X and [{}] Out, which are supposed to be equal".format( + len(kwargs['X']), len(kwargs['Out'])) + + filter_vars = [] + for varname in kwargs['X']: + if rank_id in ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)).process_mesh.processes: + filter_vars.append(varname) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + dist_op_desc.set_input('X', filter_vars) + dist_op_desc.set_output('Out', filter_vars) + main_block._sync_with_cpp() + + # sync result + group = new_process_group(global_process_mesh) + + inf_var = main_block.var(kwargs['FoundInfinite'][0]) + inf_var_int32 = main_block.create_var( + name=inf_var.name + "@cast_int32", + shape=inf_var.shape, + dtype=core.VarDesc.VarType.INT32) + set_var_dist_attr( + ctx, inf_var_int32, + ctx.get_tensor_dist_attr_for_program(inf_var).dims_mapping, + ctx.get_tensor_dist_attr_for_program(inf_var).process_mesh) + cast_op1 = main_block.append_op( + type='cast', + inputs={'X': inf_var}, + outputs={'Out': inf_var_int32}, + attrs={ + "in_dtype": inf_var.dtype, + "out_dtype": inf_var_int32.dtype, + OP_ROLE_KEY: OpRole.Backward + }) + allreduce_op = main_block.append_op( + type='c_allreduce_max', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + cast_op2 = main_block.append_op( + type='cast', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var}, + attrs={ + "in_dtype": inf_var_int32.dtype, + "out_dtype": inf_var.dtype, + OP_ROLE_KEY: OpRole.Backward + }) + main_block._sync_with_cpp() + + for op in [cast_op1, allreduce_op, cast_op2]: + new_op_dist_attr = OperatorDistributedAttribute() + for varname in op.input_arg_names: + var_dist_attr = ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)) + assert var_dist_attr is not None + 
new_op_dist_attr.set_input_dims_mapping( + varname, var_dist_attr.dims_mapping) + for varname in op.output_arg_names: + var_dist_attr = ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)) + new_op_dist_attr.set_output_dims_mapping( + varname, var_dist_attr.dims_mapping) + new_op_dist_attr.process_mesh = var_dist_attr.process_mesh + ctx.set_op_dist_attr_for_program(op, new_op_dist_attr) + + +register_distributed_operator_impl( + "check_finite_and_unscale", + DistributedCheckFiniteAndUnscaleImpl("check_finite_and_unscale")) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 7cad4d746bbf2..294a966726d73 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -36,7 +36,7 @@ from .partitioner import Partitioner from .process_group import get_all_process_groups from .process_group import get_process_group -from .process_group import get_world_process_groups +from .process_group import get_world_process_group from .process_group import _g_process_group_map, ProcessGroup from .utils import make_data_unshard from .utils import set_grad_var_shape @@ -97,13 +97,16 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) - def _apply_serial_pass(self, main_program, startup_program): - + def _apply_pre_optimization_passed(self, main_program, startup_program, + loss, params_grads): # apply amp pass if self._dist_strategy.amp: - auto_parallel_amp_pass = new_pass("auto_parallel_amp_pass", - self._dist_strategy.amp_configs) - auto_parallel_amp_pass.apply(main_program, startup_program, + config = copy.deepcopy(self._dist_strategy.amp_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["loss"] = loss + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], self._pass_context) # apply recompute pass @@ -185,10 +188,10 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): self._parameter_list, self._no_grad_set, self._callbacks) # serial forward pass - self._apply_serial_pass(completed_main_program, serial_startup_program) - + self._apply_pre_optimization_passed(completed_main_program, + serial_startup_program, serial_loss, + params_grads) # Logical partition - rank = paddle.distributed.get_rank() partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( completed_main_program, serial_startup_program, params_grads) @@ -235,7 +238,7 @@ def parallelize(self, assert self._cluster is not None, \ "The cluster must not be none when using auto mapping." 
dist_programs = {} - world_process_group = get_world_process_groups() + world_process_group = get_world_process_group() dist_context = None # auto search if self._dist_strategy.auto_search: diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index fee52e85697dc..1df70672e55da 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -33,7 +33,7 @@ def get_process_group(group_id, g_process_group_map=None): group_id, None) -def get_world_process_groups(): +def get_world_process_group(): global _g_process_group_map return _g_process_group_map[0] diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 87454d8842497..06f2efe08a489 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -16,6 +16,7 @@ from .fuse_all_reduce import * from .auto_parallel_gradient_merge import * from .auto_parallel_sharding import * +from .auto_parallel_amp import * from .cpp_pass import * __all__ = [ diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py new file mode 100644 index 0000000000000..d2af422bac023 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -0,0 +1,715 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.framework import core +from paddle.fluid import unique_name +from .pass_base import PassBase, register_pass +from paddle.distributed.fleet.meta_optimizers.common import OpRole +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type +from paddle.distributed.auto_parallel.utils import get_loss_op, set_var_dist_attr +from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping +from paddle.distributed.auto_parallel.process_group import get_world_process_group +from paddle.fluid.contrib.mixed_precision.fp16_utils import AutoMixedPrecisionLists +from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_fp32_input, _keep_fp32_output, find_op_index +from paddle.fluid.contrib.mixed_precision.fp16_utils import _valid_types, find_true_post_op, find_true_prev_op +from paddle.fluid.contrib.mixed_precision.fp16_utils import _is_in_black_varnames, _dtype_to_str, _rename_arg +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute +global_process_mesh = get_world_process_group().ranks + + +class AMPState(object): + def __init__(self, block): + self._block = block + self._op_fp16_dict = { + } # op_id --> True/False. 'True' means that the current op is in fp16 mode. 
+ self._var_name_dict = {} # fwd_op_id --> {old_name: cast_name} + + def _is_fp16_op(self, op_id): + return self._op_fp16_dict.get(op_id, None) + + def _build_stats(self, amp_lists, dist_context): + ops = self._block.ops + dist_op_context = dist_context.dist_op_context + for op in ops: + if int(op.attr('op_role')) == int(OpRole.Forward): + self._mark_black_white_ops(amp_lists) + elif int(op.attr('op_role')) == int(OpRole.Backward): + if op.desc.id() in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[op.desc.id( + )] + if self._is_fp16_op(fwd_op_id) == True: + self._op_fp16_dict[op.desc.id()] = True + elif self._is_fp16_op(fwd_op_id) == False: + self._op_fp16_dict[op.desc.id()] = False + elif int(op.attr('op_role')) == int(OpRole.Optimize): + break + + def _mark_black_white_ops(self, amp_lists): + """ + this function is modified from paddle.fluid.contrib.mixed_precision + """ + self._block._sync_with_cpp() + ops = self._block.ops + + for op in ops: + if int(op.attr('op_role')) == int(OpRole.Backward): + break + if op.type == 'create_py_reader' or op.type == 'read': + continue + if amp_lists.black_varnames is not None and _is_in_black_varnames( + op, amp_lists): + self._op_fp16_dict[op.desc.id()] = False + continue + if op.type in amp_lists.black_list: + self._op_fp16_dict[op.desc.id()] = False + elif op.type in amp_lists.white_list: + self._op_fp16_dict[op.desc.id()] = True + elif op.type in amp_lists.gray_list: + is_black_op = False + is_white_op = False + for in_name in op.input_names: + # if this op has inputs + if in_name: + for in_var_name in op.input(in_name): + in_var = self._block.var(in_var_name) + # this in_var isn't the output of other op + if in_var.op is None: + continue + elif in_var.op is op: + prev_op = find_true_prev_op(ops, op, + in_var_name) + if prev_op is None: + continue + else: + prev_op = in_var.op + # if it's one of inputs + if self._is_fp16_op(prev_op.desc.id()) == False or \ + prev_op.type in amp_lists.black_list: + is_black_op = True + elif self._is_fp16_op(prev_op.desc.id()) == True or \ + prev_op.type in amp_lists.white_list: + is_white_op = True + if is_black_op: + self._op_fp16_dict[op.desc.id()] = False + elif is_white_op: + self._op_fp16_dict[op.desc.id()] = True + else: + pass + else: + # For numerical safe, we apply fp32 computation on ops that + # are not determined which list they should stay. 
+ self._op_fp16_dict[op.desc.id()] = False + + def cast_forward_program(self, dist_context): + ops = self._block.ops + idx = 0 + while idx < len(ops): + op = ops[idx] + num_cast_ops = 0 + if int(op.attr('op_role')) == int(OpRole.Backward): + break + if self._is_fp16_op(op.desc.id()) == False: + num_cast_ops = self._insert_cast_op_forward( + op, idx, core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32, dist_context) + elif self._is_fp16_op(op.desc.id()) == True: + num_cast_ops = self._insert_cast_op_forward( + op, idx, core.VarDesc.VarType.FP32, + core.VarDesc.VarType.FP16, dist_context) + else: + pass + idx += num_cast_ops + 1 + self._block._sync_with_cpp() + + def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype, + dist_context): + """ + only for forward cast + modified from paddle.fluid.contrib.mixed_precision + """ + num_cast_ops = 0 + + for in_name in op.input_names: + var_name_dict = {} + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( + op, in_name): + continue + for in_var_name in op.input(in_name): + in_var = self._block._find_var_recursive(in_var_name) + if in_var.type not in _valid_types or in_var.dtype == dst_dtype: + continue + if in_var.dtype == src_dtype: + cast_name = in_var.name + '.cast_' + _dtype_to_str( + dst_dtype) + out_var = self._block.vars.get(cast_name) + var_name_dict[in_var.name] = cast_name + consume_op_attr = dist_context.get_op_dist_attr_for_program( + op) + assert consume_op_attr is not None + if out_var is None or out_var.dtype != dst_dtype: + # NOTE we make the cast op and var's dist attr as the op that consume the + # cast var instead of the op which generates the var + in_var_dist_attr = consume_op_attr.get_input_dist_attr( + in_var.name) + assert in_var_dist_attr is not None + ref_mesh = in_var_dist_attr.process_mesh + ref_mapping = in_var_dist_attr.dims_mapping + consume_op_attr.set_input_dist_attr(cast_name, + in_var_dist_attr) + + out_var = self._block.create_var( + name=cast_name, + dtype=dst_dtype, + persistable=False, + stop_gradient=in_var.stop_gradient) + set_var_dist_attr(dist_context, out_var, ref_mapping, + ref_mesh) + + cast_op = self._block._insert_op_without_sync( + idx, + type="cast", + inputs={"X": in_var}, + outputs={"Out": out_var}, + attrs={ + "in_dtype": in_var.dtype, + "out_dtype": out_var.dtype, + }) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_op, ref_mesh, ref_mapping, dist_context) + num_cast_ops += 1 + else: + in_var_dist_attr = consume_op_attr.get_input_dist_attr( + in_var.name) + consume_op_attr.set_input_dist_attr(cast_name, + in_var_dist_attr) + _rename_arg(op, in_var.name, cast_name) + else: + if op.has_attr('in_dtype'): + op._set_attr('in_dtype', dst_dtype) + self._var_name_dict[op.desc.id()] = var_name_dict + + if src_dtype == core.VarDesc.VarType.FP32 and dst_dtype == core.VarDesc.VarType.FP16: + for out_name in op.output_names: + if _keep_fp32_output(op, out_name): + continue + for out_var_name in op.output(out_name): + out_var = self._block.var(out_var_name) + if out_var.type not in _valid_types: + continue + if out_var.dtype == core.VarDesc.VarType.FP32: + out_var.desc.set_dtype(core.VarDesc.VarType.FP16) + if op.has_attr('out_dtype'): + op._set_attr('out_dtype', core.VarDesc.VarType.FP16) + return num_cast_ops + + def cast_backward_program(self, params_grads, dist_context): + self._block._sync_with_cpp() + ops = self._block.ops + + loss_op = get_loss_op(self._block) + loss_op_index = find_op_index(self._block.desc, loss_op.desc) + + idx = loss_op_index + 1 + while idx 
< len(ops): + num_cast_ops = 0 + grad_op = ops[idx] + dist_op_context = dist_context.dist_op_context + if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(grad_op.desc.id()) == False: # fp32 + num_cast_ops = self._insert_cast_op_backward( + grad_op, idx, core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32, dist_context) + elif self._is_fp16_op(grad_op.desc.id()) == True: # fp16 + num_cast_ops = self._insert_cast_op_backward( + grad_op, idx, core.VarDesc.VarType.FP32, + core.VarDesc.VarType.FP16, dist_context) + elif grad_op.type == "sum": + in_var_name = grad_op.desc.input_arg_names()[0] + src_dtype = self._block.var(in_var_name).dtype + for in_var_name in grad_op.desc.input_arg_names(): + assert src_dtype == self._block.var(in_var_name).dtype + out_var_name = grad_op.desc.output_arg_names()[0] + out_var = self._block.var(out_var_name) + if out_var.dtype != src_dtype: + out_var.desc.set_dtype(src_dtype) + elif int(grad_op.attr('op_role')) == 257: + pass + else: + raise ValueError( + "'{}' op is not supported in the complete amp pass.".format( + grad_op.type)) + idx += num_cast_ops + 1 + + self._block._sync_with_cpp() + _update_backward_cast_ops(params_grads, dist_context) + + def _insert_cast_op_backward(self, grad_op, idx, src_dtype, dst_dtype, + dist_context): + """ only for backward cast """ + + def _keep_fp32_input(op, in_name): + op_type = op.type + if op_type in ['layer_norm_grad']: + return in_name not in {'X', 'Y@GRAD'} + return False + + def _keep_fp32_output(op, out_name): + op_type = op.type + if op_type in ['layer_norm_grad']: + return out_name != 'X@GRAD' + return False + + num_cast_ops = 0 + dist_op_context = dist_context.dist_op_context + fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()] + + for in_name in grad_op.input_names: + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( + grad_op, in_name): + for in_var_name in grad_op.input(in_name): + in_var = self._block._find_var_recursive(in_var_name) + assert in_var.dtype == core.VarDesc.VarType.FP32 + continue + + for in_var_name in grad_op.input(in_name): + in_var = self._block._find_var_recursive(in_var_name) + if in_var.dtype == src_dtype: + consume_op_attr = dist_context.get_op_dist_attr_for_program( + grad_op) + if in_var_name in self._var_name_dict[fwd_op_id]: + # NOTE: if in_var of consume grad_op has been casted before, + # it should be renamed and reset dist_attr. 
+ cast_name = self._var_name_dict[fwd_op_id][in_var_name] + grad_op.desc._rename_input(in_var_name, cast_name) + in_var_dist_attr = consume_op_attr.get_input_dist_attr( + in_var_name) + consume_op_attr.set_input_dist_attr(cast_name, + in_var_dist_attr) + else: + assert in_var.dtype == dst_dtype + + for out_name in grad_op.output_names: + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_output( + grad_op, out_name): + for out_var_name in grad_op.output(out_name): + out_var = self._block._find_var_recursive(out_var_name) + assert out_var.dtype == core.VarDesc.VarType.FP32 + continue + + for out_var_name in grad_op.output(out_name): + out_var = self._block._find_var_recursive(out_var_name) + out_var_name_prefix = out_var_name[:out_var_name.find("@")] + fwd_var = self._block._find_var_recursive(out_var_name_prefix) + # NOTE: the out_var's dtype of consume grad_op should equal to the fwd_var's dtype + if out_var.dtype != fwd_var.dtype: + out_var.desc.set_dtype(fwd_var.dtype) + + if out_var.dtype == src_dtype: + if out_var_name_prefix in self._var_name_dict[fwd_op_id]: + # NOTE: if out_var of consume grad_op has been casted before, + # it should be renamed and reset dist_attr, then we insert cast op to + # convert the cast_var to original dtype + consume_op_attr = dist_context.get_op_dist_attr_for_program( + grad_op) + fwd_cast_name = self._var_name_dict[fwd_op_id][ + out_var_name_prefix] + cast_name = fwd_cast_name + "@GRAD" + cast_var = self._block.vars.get(cast_name) + if cast_var is None or cast_var.dtype != dst_dtype: + grad_op.desc._rename_output(out_var_name, cast_name) + out_var_dist_attr = consume_op_attr.get_output_dist_attr( + out_var_name) + ref_mesh = out_var_dist_attr.process_mesh + ref_mapping = out_var_dist_attr.dims_mapping + consume_op_attr.set_output_dist_attr( + cast_name, out_var_dist_attr) + assert ref_mapping is not None + cast_var = self._block.create_var( + name=cast_name, + shape=out_var.shape, + dtype=dst_dtype, + persistable=False, + stop_gradient=out_var.stop_gradient) + set_var_dist_attr(dist_context, cast_var, + ref_mapping, ref_mesh) + + cast_op = self._block._insert_op( + idx + 1, + type="cast", + inputs={"X": cast_var}, + outputs={"Out": out_var}, + attrs={ + "in_dtype": cast_var.dtype, + "out_dtype": out_var.dtype, + "op_role": OpRole.Backward + }) + cast_op._remove_attr("op_role_var") + cast_op._remove_attr("op_namescope") + cast_op._remove_attr("with_quant_attr") + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_op, ref_mesh, ref_mapping, dist_context) + num_cast_ops += 1 + else: + assert out_var.dtype == dst_dtype + + return num_cast_ops + + +def _update_backward_cast_ops(params_grads, dist_context): + """ + move param grad cast to the end of backward segment + in order to enabel fp16 allreduce + """ + # TODO filter optimize ops in future + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + + for p, g in params_grads: + op = g.op + if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast': + if int(op.attr('op_role')) == int(OpRole.Backward) and op.has_attr( + 'op_role_var'): + op._remove_attr("op_role_var") + + post_ops = find_true_post_op(main_block.ops, op, g.name) + if post_ops: + raise ValueError("The cast op {0}'s output should not be" + "used by a non-optimize op, however, it" + "is used by {1}".format(op, post_ops[0])) + + if op == main_block.ops[-1]: + continue + + # add new op in the python and cpp at the same time + new_op_desc = main_block.desc.append_op() + 
new_op_desc.copy_from(op.desc) + new_op = paddle.fluid.framework.Operator( + block=main_block, + desc=new_op_desc, + type=None, + inputs=None, + outputs=None, + attrs=None) + main_block.ops.append(new_op) + + # dist attr + param_dist_attr = dist_context.get_tensor_dist_attr_for_program(p) + output_dist_attr = dist_context.get_tensor_dist_attr_for_program( + main_block.var(op.output_arg_names[0])) + assert param_dist_attr is not None + assert output_dist_attr is not None + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, param_dist_attr.process_mesh, + param_dist_attr.dims_mapping, dist_context) + + output_dist_attr.process_mesh = param_dist_attr.process_mesh + output_dist_attr.dims_mapping = param_dist_attr.dims_mapping + + op_idx = find_op_index(main_block.desc, op.desc) + if op_idx == -1: + raise ValueError("The op {0} is not in program".format(op)) + main_block._remove_op(op_idx, sync=False) + + main_block._sync_with_cpp() + + +def _check_and_update_gradient(params_grads, loss_scaling, dist_context): + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + + grads = [g for _, g in params_grads] + check_type(grads, 'x', (tuple, list), 'check_finite_and_unscale') + for e in grads: + check_variable_and_dtype(e, "x", ['float16', 'float32', 'float64'], + 'check_finite_and_unscale') + + found_inf = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ['find_infinite_scale', 'tmp'])), + shape=[1], + dtype='bool', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + set_var_dist_attr(dist_context, found_inf, [-1], global_process_mesh) + + inputs = {'X': grads, 'Scale': loss_scaling} + outputs = {'Out': grads, 'FoundInfinite': found_inf} + attrs = {'op_role': OpRole.Backward} + new_op = main_block.append_op( + type='check_finite_and_unscale', + inputs=inputs, + outputs=outputs, + attrs=attrs) + + new_op_dist_attr = OperatorDistributedAttribute() + new_op_dist_attr.process_mesh = global_process_mesh + if len(global_process_mesh) > 1: + new_op_dist_attr.impl_idx = 0 + for g in grads: + g_dist_attr = dist_context.get_tensor_dist_attr_for_program(g) + assert g_dist_attr is not None + new_op_dist_attr.set_input_dims_mapping(g.name, + g_dist_attr.dims_mapping) + new_op_dist_attr.set_output_dims_mapping(g.name, + g_dist_attr.dims_mapping) + dist_context.set_op_dist_attr_for_program(new_op, new_op_dist_attr) + return grads, found_inf + + +@register_pass("auto_parallel_amp") +class AMPPass(PassBase): + def __init__(self): + super(AMPPass, self).__init__() + self.set_attr("loss", None) + self.set_attr("dist_context", None) + self.set_attr("custom_white_list", None) + self.set_attr("custom_black_list", None) + self.set_attr("custom_black_varnames", None) + self.set_attr("init_loss_scaling", 32768.0) + self.set_attr("incr_every_n_steps", 1000) + self.set_attr("decr_every_n_nan_or_inf", 2) + self.set_attr("incr_ratio", 2.0) + self.set_attr("decr_ratio", 0.8) + self.set_attr("use_dynamic_loss_scaling", False) + self.set_attr("params_grads", []) + self._loss_scaling = None + self._num_good_steps = None + self._num_bad_steps = None + + def _check_self(self): + if self.get_attr("init_loss_scaling") < 0: + return False + if self.get_attr("incr_every_n_steps") < 0: + return False + if self.get_attr("decr_every_n_nan_or_inf") < 0: + return False + if self.get_attr("incr_ratio") < 0: + return False + if self.get_attr("decr_ratio") < 0: + return False + if len(self.get_attr("params_grads")) <= 0: 
+ return False + if self.get_attr("dist_context") is None: + return False + return True + + def _check_conflict(self, other_pass): + + return True + + # NOTE: why AMPBackwardPass can override apply_single_impl instead of + # apply_impl? AMP is an optimization pass for serial program, + # in distributed scenario, all ranks should have the same modification. + def _apply_single_impl(self, main_program, startup_program, context): + self.dist_context = self.get_attr("dist_context") + params_grads = self.get_attr("params_grads") + + amp_lists = AutoMixedPrecisionLists( + set(self.get_attr("custom_white_list")), + set(self.get_attr("custom_black_list")), + set(self.get_attr("custom_black_varnames"))) + + amp_state = AMPState(main_program.global_block()) + amp_state._build_stats(amp_lists, self.dist_context) + + with paddle.static.program_guard(main_program, startup_program): + amp_state.cast_forward_program(self.dist_context) + amp_state.cast_backward_program(params_grads, self.dist_context) + # TODO (JZ-LIANG)support cast forward program only when inference + self._init_amp_var() + self._scale_loss() + + if self.get_attr("use_dynamic_loss_scaling") or self.get_attr( + "init_loss_scaling") != 1.0: + grads, found_inf = _check_and_update_gradient( + params_grads, self._loss_scaling, self.dist_context) + + if self.get_attr("use_dynamic_loss_scaling"): + self._update_loss_scaling(grads, found_inf) + + def _init_amp_var(self): + self._loss_scaling = paddle.static.create_global_var( + name=unique_name.generate("loss_scaling"), + shape=[1], + value=self.get_attr("init_loss_scaling"), + dtype='float32', + persistable=True) + set_var_dist_attr(self.dist_context, self._loss_scaling, [-1], + global_process_mesh) + + if self.get_attr("use_dynamic_loss_scaling"): + self._num_good_steps = paddle.static.create_global_var( + name=unique_name.generate("num_good_steps"), + shape=[1], + value=0, + dtype='int32', + persistable=True) + set_var_dist_attr(self.dist_context, self._num_good_steps, [-1], + global_process_mesh) + + self._num_bad_steps = paddle.static.create_global_var( + name=unique_name.generate("num_bad_steps"), + shape=[1], + value=0, + dtype='int32', + persistable=True) + set_var_dist_attr(self.dist_context, self._num_bad_steps, [-1], + global_process_mesh) + + def _scale_loss(self): + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + loss = self.get_attr("loss") + assert loss is not None + loss_op = loss.op + loss_op_dist_attr = self.dist_context.get_op_dist_attr_for_program( + loss_op) + + if loss.dtype != core.VarDesc.VarType.FP32: + loss = loss.astype('float32') + + if self.get_attr("use_dynamic_loss_scaling") or self.get_attr( + "init_loss_scaling") != 1.0: + + loss_op_idx = find_op_index(main_block.desc, loss_op.desc) + + # forward + ref_mesh = loss_op_dist_attr.process_mesh + self._scaled_loss = main_block.create_var( + name=unique_name.generate("scaled_loss"), + shape=loss.shape, + dtype=loss.dtype, + persistable=loss.persistable) + set_var_dist_attr(self.dist_context, self._scaled_loss, [-1], + ref_mesh) + + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + elementwise_mul_op = main_block._insert_op( + loss_op_idx + 1, + type='elementwise_mul', + inputs={'X': [loss], + 'Y': [self._loss_scaling]}, + outputs={'Out': [self._scaled_loss]}, + attrs={'op_role': loss_op.all_attrs()[OP_ROLE_KEY], }) + loss_op._set_attr(OP_ROLE_KEY, + core.op_proto_and_checker_maker.OpRole.Forward) + 
naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + elementwise_mul_op, ref_mesh, [-1], self.dist_context) + + # backward + first_backward_op = main_block.ops[loss_op_idx + 2] + assert first_backward_op.type == "fill_constant" and int( + first_backward_op.all_attrs()[OP_ROLE_KEY]) == 257 + self._scaled_loss_grad = main_block.create_var( + name=unique_name.generate("scaled_loss") + "@GRAD", + shape=loss.shape, + dtype=loss.dtype, + persistable=loss.persistable) + set_var_dist_attr(self.dist_context, self._scaled_loss_grad, [-1], + ref_mesh) + pre_grad_name = first_backward_op.output_arg_names[0] + first_backward_op._rename_output(pre_grad_name, + self._scaled_loss_grad.name) + # FIXME(JZ-LIANG) a trick to insert backward op + main_block._sync_with_cpp() + elementwise_mul_grad_op_desc = main_block.desc._insert_op( + loss_op_idx + 3) + elementwise_mul_grad_op_desc.set_type("elementwise_mul_grad") + elementwise_mul_grad_op_desc.set_input( + 'Out@GRAD', [self._scaled_loss_grad.name]) + elementwise_mul_grad_op_desc.set_input('X', [loss.name]) + elementwise_mul_grad_op_desc.set_input('Y', + [self._loss_scaling.name]) + elementwise_mul_grad_op_desc.set_output('X@GRAD', [pre_grad_name]) + elementwise_mul_grad_op_desc.set_output('Y@GRAD', []) + elementwise_mul_grad_op_desc._set_attr( + OP_ROLE_KEY, core.op_proto_and_checker_maker.OpRole.Backward) + elementwise_mul_grad_op_desc._set_attr('axis', -1) + elementwise_mul_grad_op = paddle.fluid.framework.Operator( + main_block, elementwise_mul_grad_op_desc) + main_block.ops.insert(loss_op_idx + 3, elementwise_mul_grad_op) + main_block._sync_with_cpp() + elementwise_mul_grad_op = main_block.ops[loss_op_idx + 3] + assert elementwise_mul_grad_op.type == "elementwise_mul_grad" + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + elementwise_mul_grad_op, ref_mesh, [-1], self.dist_context) + + else: + self._scaled_loss = loss + + main_block._sync_with_cpp() + + def _update_loss_scaling(self, grads, found_inf): + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + + check_variable_and_dtype(self._loss_scaling, "prev_loss_scaling", + ['float32', 'float64'], "update_loss_scaling") + check_type(grads, 'x', (tuple, list), 'update_loss_scaling') + for e in grads: + check_variable_and_dtype(e, "x", ['float16', 'float32', 'float64'], + 'update_loss_scaling') + assert self._loss_scaling.dtype == e.dtype, \ + "The dtype of prev_loss_scaling should be equal to the dtype of x." 
+ + inputs = { + 'X': grads, + 'FoundInfinite': found_inf, + 'PrevLossScaling': self._loss_scaling, + 'InGoodSteps': self._num_good_steps, + 'InBadSteps': self._num_bad_steps + } + + outputs = { + 'Out': grads, + 'LossScaling': self._loss_scaling, + 'OutGoodSteps': self._num_good_steps, + 'OutBadSteps': self._num_bad_steps + } + + attrs = { + 'incr_every_n_steps': self.get_attr("incr_every_n_steps"), + 'decr_every_n_nan_or_inf': self.get_attr("decr_every_n_nan_or_inf"), + 'incr_ratio': self.get_attr("incr_ratio"), + 'decr_ratio': self.get_attr("decr_ratio"), + 'stop_update': self.get_attr("stop_update"), + 'op_role': OpRole.Backward + } + + new_op = main_block.append_op( + type='update_loss_scaling', + inputs=inputs, + outputs=outputs, + attrs=attrs) + + new_op_dist_attr = OperatorDistributedAttribute() + new_op_dist_attr.process_mesh = global_process_mesh + if len(global_process_mesh) > 1: + new_op_dist_attr.impl_idx = 0 + for g in grads: + g_dist_attr = self.dist_context.get_tensor_dist_attr_for_program(g) + assert g_dist_attr is not None + new_op_dist_attr.set_input_dims_mapping(g.name, + g_dist_attr.dims_mapping) + new_op_dist_attr.set_output_dims_mapping(g.name, + g_dist_attr.dims_mapping) + self.dist_context.set_op_dist_attr_for_program(new_op, new_op_dist_attr) + + main_block._sync_with_cpp() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 2785eae6e8a46..7729d1c2bd0d1 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -21,7 +21,7 @@ from paddle.fluid import unique_name from .pass_base import PassBase, register_pass from paddle.distributed.fleet.meta_optimizers.common import is_backward_op, is_optimizer_op -from paddle.distributed.auto_parallel.process_group import get_world_process_groups, new_process_group +from paddle.distributed.auto_parallel.process_group import new_process_group from paddle.distributed.auto_parallel.operators.common import is_parameter_related from paddle.distributed.auto_parallel.utils import _get_comm_group, naive_set_dist_op_attr_for_program_by_mesh_and_mapping, set_var_dist_attr diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py new file mode 100755 index 0000000000000..0507909b132e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
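(Illustrative aside, not part of this test file or of the ops' real implementation: the AMP pass above wires `check_finite_and_unscale` and `update_loss_scaling` ops into the program; a rough NumPy-level sketch of what dynamic loss scaling does, with defaults mirroring the attrs set in AMPPass, is)

import numpy as np

def check_finite_and_unscale(grads, loss_scaling):
    # Divide the scaling back out of the gradients and report inf/nan.
    found_inf = any(not np.all(np.isfinite(g)) for g in grads)
    return [g / loss_scaling for g in grads], found_inf

def update_loss_scaling(scale, found_inf, good_steps, bad_steps,
                        incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
                        incr_ratio=2.0, decr_ratio=0.8):
    # Shrink the scale after repeated inf/nan steps, grow it after a long
    # run of finite steps; the floor at 1.0 keeps the scale from collapsing.
    if found_inf:
        good_steps, bad_steps = 0, bad_steps + 1
        if bad_steps == decr_every_n_nan_or_inf:
            scale, bad_steps = max(scale * decr_ratio, 1.0), 0
    else:
        bad_steps, good_steps = 0, good_steps + 1
        if good_steps == incr_every_n_steps:
            scale, good_steps = scale * incr_ratio, 0
    return scale, good_steps, bad_steps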
+ +import sys +import random +import numpy as np + +import unittest +import paddle +import paddle.distributed.fleet as fleet +from auto_parallel_pass_test_base import AutoPallelPassTestBase + + +class TestAMPPass(AutoPallelPassTestBase): + def init(self): + if paddle.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) + self.rtol = 1e-5 + self.atol = 1e-8 + + rank = paddle.distributed.get_rank() + paddle.seed(rank + 2021) + random.seed(rank + 2021) + np.random.seed(rank + 2021) + + def apply_passes(self): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = True + dist_strategy.amp_configs = { + "custom_white_list": [ + 'softmax', + 'layer_norm', + 'gelu', + ], + "custom_black_list": ['c_softmax_with_cross_entropy'], + "init_loss_scaling": 32768, + "use_dynamic_loss_scaling": True, + } + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + def test_bs_8(self): + self.check_main( + gpus=[0, 1], batch_size=8, sequence_len=512, vocab_size=1000) + + def get_model(self, place, batch_size, sequence_len, vocab_size): + return self.get_gpt_model("mp", place, batch_size, sequence_len, + vocab_size) + + +if __name__ == "__main__": + unittest.main() From d296456c1af37cb4b1a87e2b684488f811318f58 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Wed, 12 Jan 2022 17:16:51 +0800 Subject: [PATCH 112/151] support 5d for nearest interp (#38868) * support 5d for nearest * update nearest3d unittest, test=develop * fix approve ci, test=develop * fix approve ci, test=develop --- paddle/fluid/operators/interpolate_v2_op.cc | 12 +- paddle/fluid/operators/interpolate_v2_op.cu | 126 ++++++++++++ paddle/fluid/operators/interpolate_v2_op.h | 77 +++++++ .../unittests/test_nearest_interp_v2_op.py | 192 ++++++++++++++++-- python/paddle/nn/functional/common.py | 11 +- python/paddle/nn/layer/common.py | 3 +- 6 files changed, 394 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index de276cfa31cb5..7783303785998 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -249,12 +249,12 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { auto dim_x = ctx->GetInputDim("X"); auto interp_method = ctx->Attrs().Get("interp_method"); - PADDLE_ENFORCE_EQ( - "trilinear", interp_method, - platform::errors::InvalidArgument( - "Interpolation method can only be \"trilinear\" when Input(X) " - "dimension is 5, but got method = %s .", - interp_method)); + PADDLE_ENFORCE("nearest" == interp_method || "trilinear" == interp_method, + platform::errors::InvalidArgument( + "Interpolation method can only be \"trilinear\" or " + "\"nearest\" when Input(X) " + "dimension is 5, but got method = %s .", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index bc1ab704aafe3..3db0fdf5e6da4 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -67,6 +67,61 @@ __global__ void KeNearestNeighborInterpFw( } } +template +__global__ void KeNearestNeighbor3DInterpFw( + const T* in, const size_t in_img_d, const size_t in_img_h, + const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, + const size_t out_img_d, const size_t out_img_h, const 
size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const float ratio_d, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; // ncdhw + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + template __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, @@ -114,6 +169,63 @@ __global__ void KeNearestNeighborInterpBw( } } +template +__global__ void KeNearestNeighbor3DInterpBw( + T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, const T* out, + const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const float ratio_d, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? 
static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + const T out_pos = out[out_id_h * output_w + out_id_w]; + platform::CudaAtomicAdd(in_pos, out_pos); + } +} + template __global__ void KeLinearInterpFw(const T* in, const size_t in_img_w, const size_t input_w, T* out, @@ -1376,6 +1488,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpFw< + T><<>>( + input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, + out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + data_layout); } } @@ -1801,6 +1920,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpBw< + T><<>>( + input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, + out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + data_layout); } } diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 8daf440f60e5f..0af799eca0c55 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -121,6 +121,39 @@ static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, } } +template +static void NearestNeighbor3DInterpolate( + const Tensor& input, Tensor* output, const float ratio_d, + const float ratio_h, const float ratio_w, const int n, const int c, + const int out_d, const int out_h, const int out_w, const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int d = 0; d < out_d; d++) { // loop for images + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, d, k, l) = input_t(i, j, in_d, in_k, in_l); + } else { // NDHWC + output_t(i, d, k, l, j) = input_t(i, in_d, in_k, in_l, j); + } + } + } + } + } + } +} + template static void LinearInterpolation(const Tensor& input, Tensor* output, const float ratio_w, const int in_w, @@ -584,6 +617,42 @@ static void NearestNeighborInterpolateGrad( } } +template +static void NearestNeighbor3DInterpolateGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_d, + const float ratio_h, const float ratio_w, const int n, const int c, + const int out_d, const int out_h, const int out_w, const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int d = 0; d < out_d; d++) { + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_d, in_k, in_l) += + output_grad_t(i, j, d, k, l); + } else { + input_grad_t(i, in_d, in_k, in_l, j) += + output_grad_t(i, d, k, l, j); + } + } + } + } + } + } +} + template static void BilinearInterpolationGrad( const Tensor& output_grad, Tensor* input_grad, const float ratio_h, @@ -1137,6 +1206,10 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, TrilinearInterpolation(input, output, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, c, out_d, out_h, out_w, align_corners, align_mode, data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolate(input, output, ratio_d, ratio_h, ratio_w, n, + c, out_d, out_h, out_w, align_corners, + data_layout); } } @@ -1489,6 +1562,10 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, TrilinearInterpolationGrad( output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, c, out_d, out_h, out_w, align_corners, align_mode, data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolateGrad(output_grad, input_grad, ratio_d, + ratio_h, ratio_w, n, c, out_d, out_h, + out_w, align_corners, data_layout); } } diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index 04962a93c11c1..e2ac98f7c9f1f 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -23,6 +23,8 @@ import paddle from paddle.nn.functional import interpolate +paddle.enable_static() + def nearest_neighbor_interp_np(X, out_h, @@ -78,7 +80,80 @@ def nearest_neighbor_interp_np(X, if data_layout == "NHWC": out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + # out = np.expand_dims(out, 2) + return out.astype(X.dtype) + + +def nearest_neighbor_interp3d_np(X, + out_d, + out_h, + out_w, + scale_d=0, + scale_h=0, + scale_w=0, + out_size=None, + actual_shape=None, + 
align_corners=True, + data_layout='NCHW'): + """nearest neighbor interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + X = np.transpose(X, (0, 4, 1, 2, 3)) # NDHWC => NCDHW + if out_size is not None: + out_d = out_size[0] + out_h = out_size[1] + out_w = out_size[2] + if actual_shape is not None: + out_d = actual_shape[0] + out_h = actual_shape[1] + out_w = actual_shape[2] + n, c, in_d, in_h, in_w = X.shape + ratio_d = ratio_h = ratio_w = 0.0 + if (out_d > 1): + if (align_corners): + ratio_d = (in_d - 1.0) / (out_d - 1.0) + else: + if scale_d > 0: + ratio_d = 1.0 / scale_d + else: + ratio_d = 1.0 * in_d / out_d + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w + out = np.zeros((n, c, out_d, out_h, out_w)) + + if align_corners: + for d in range(out_d): + in_d = int(ratio_d * d + 0.5) + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, :, d, i, j] = X[:, :, in_d, in_i, in_j] + else: + for d in range(out_d): + in_d = int(ratio_d * d) + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, :, d, i, j] = X[:, :, in_d, in_i, in_j] + + if data_layout == "NDHWC": + out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC return out.astype(X.dtype) @@ -91,44 +166,81 @@ def setUp(self): self.op_type = "nearest_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") - if self.data_layout == "NCHW": + if self.data_layout == "NCHW" and len(self.input_shape) == 4: + in_d = 1 in_h = self.input_shape[2] in_w = self.input_shape[3] else: + in_d = 1 in_h = self.input_shape[1] in_w = self.input_shape[2] + + if self.data_layout == "NCDHW" and len(self.input_shape) == 5: + in_d = self.input_shape[2] + in_h = self.input_shape[3] + in_w = self.input_shape[4] + else: + in_d = self.input_shape[1] + in_h = self.input_shape[2] + in_w = self.input_shape[3] + scale_d = 0 scale_h = 0 scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: - scale_h = scale_w = float(self.scale) + scale_d = scale_h = scale_w = float(self.scale) if isinstance(self.scale, list) and len(self.scale) == 1: - scale_w = scale_h = self.scale[0] + scale_d = scale_w = scale_h = self.scale[0] elif isinstance(self.scale, list) and len(self.scale) > 1: - scale_w = self.scale[1] - scale_h = self.scale[0] + if len(self.scale) == 5: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + else: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) out_w = int(in_w * scale_w) + out_d = int(in_d * scale_d) else: + if len(self.input_shape) == 5: + out_d = self.out_d out_h = self.out_h out_w = self.out_w - output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, scale_h, scale_w, self.out_size, - self.actual_shape, self.align_corners, self.data_layout) + if len(self.input_shape) == 4: + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) + elif len(self.input_shape) == 5: + output_np = nearest_neighbor_interp3d_np( + input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w, + 
self.out_size, self.actual_shape, self.align_corners, + self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size if self.actual_shape is not None: self.inputs['OutSize'] = self.actual_shape - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'data_layout': self.data_layout - } + if len(self.input_shape) == 5: + self.attrs = { + 'out_d': self.out_d, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + else: + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: @@ -157,7 +269,8 @@ def init_test_case(self): class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] + self.input_shape = [4, 1, 1, 7, 8] + self.out_d = 1 self.out_h = 1 self.out_w = 1 self.scale = 0. @@ -366,6 +479,18 @@ def init_test_case(self): self.align_corners = True +class TestNearestNeighbor3DInterp(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 4, 7, 5] + self.out_d = 8 + self.out_h = 64 + self.out_w = 32 + self.scale = [4.0, 2.0, 3.0] + self.out_size = np.array([8, 66, 40]).astype("int32") + self.align_corners = True + + class TestNearestInterpOp_attr_tensor(OpTest): def setUp(self): self.out_size = None @@ -549,8 +674,32 @@ def test_case(self): self.assertTrue(np.allclose(out.numpy(), expect_res)) +class TestNearestInterp3DOpAPI_dy(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 2, 6, 6, 6)).astype("int64") + scale_np = np.array([2, 2, 2]).astype("int64") + input_x = paddle.to_tensor(input_data) + scale = paddle.to_tensor(scale_np) + expect_res = nearest_neighbor_interp3d_np( + input_data, out_d=12, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, + scale_factor=scale, + mode="nearest", + align_corners=False, + data_format="NCDHW") + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + class TestNearestInterpException(unittest.TestCase): def test_exception(self): + import paddle input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32") def attr_data_format(): @@ -564,9 +713,20 @@ def attr_scale_type(): def attr_scale_value(): out = fluid.layers.resize_nearest(input, scale=-0.3) + def input_shape_error(): + x = paddle.randn([1, 3]) + out = paddle.nn.functional.interpolate(x, scale_factor='scale') + + def mode_error(): + x = paddle.randn([1, 3]) + out = paddle.nn.functional.interpolate( + x, scale_factor='scale', mode="BILINEAR") + self.assertRaises(ValueError, attr_data_format) self.assertRaises(TypeError, attr_scale_type) self.assertRaises(ValueError, attr_scale_value) + self.assertRaises(ValueError, input_shape_error) + self.assertRaises(ValueError, mode_error) if __name__ == "__main__": diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 3dba9505e92c7..5a010ad2f20c5 100644 --- a/python/paddle/nn/functional/common.py +++ 
b/python/paddle/nn/functional/common.py @@ -221,7 +221,8 @@ def interpolate(x, ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', 'trilinear', 'bicubic', 'area' or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. + ValueError: 'bilinear' and 'bicubic' only support 4-D tensor. + ValueError: 'nearest' only support 4-D or 5-D tensor. ValueError: 'trilinear' only support 5-D tensor. ValueError: One of size and scale_factor must not be None. ValueError: size length should be 1 for input 3-D tensor. @@ -276,9 +277,11 @@ def interpolate(x, if resample in ['LINEAR'] and len(x.shape) != 3: raise ValueError("'linear' only support 3-D tensor.") - if resample in ['BILINEAR', 'NEAREST', 'BICUBIC'] and len(x.shape) != 4: - raise ValueError( - "'bilinear', 'bicubic' and 'nearest' only support 4-D tensor.") + if resample in ['NEAREST'] and len(x.shape) != 4 and len(x.shape) != 5: + raise ValueError("'NEAREST' only support 4-D or 5-D tensor.") + + if resample in ['BILINEAR', 'BICUBIC'] and len(x.shape) != 4: + raise ValueError("'bilinear' and 'bicubic' only support 4-D tensor.") if resample == 'TRILINEAR' and len(x.shape) != 5: raise ValueError("'trilinear'only support 5-D tensor.") diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 22f7f798374d8..89ff156bded2a 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -359,8 +359,9 @@ class Upsample(Layer): ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', 'trilinear', 'bicubic', or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. + ValueError: 'bilinear' and 'bicubic' only support 4-D tensor. ValueError: 'trilinear' only support 5-D tensor. + ValueError: 'nearest' only support 4-D or 5-D tensor. ValueError: One of size and scale_factor must not be None. ValueError: size length should be 1 for input 3-D tensor. ValueError: size length should be 2 for input 4-D tensor. 
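(Illustrative aside, not part of the patch: a minimal usage sketch of the 5-D nearest interpolation enabled above; shapes are arbitrary and it assumes a Paddle build that already contains this change.)

import numpy as np
import paddle
import paddle.nn.functional as F

# 5-D NCDHW input: batch=1, channels=2, depth=3, height=4, width=5
x = paddle.to_tensor(np.random.rand(1, 2, 3, 4, 5).astype("float32"))

# 'nearest' mode now accepts 5-D tensors; before this patch it was 4-D only
out = F.interpolate(x, scale_factor=2, mode="nearest",
                    align_corners=False, data_format="NCDHW")
print(out.shape)  # [1, 2, 6, 8, 10]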
From 4825addd11f6aecfc27e961ff72726051239e6ef Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 12 Jan 2022 11:29:03 +0100 Subject: [PATCH 113/151] Fix conv act int8 scale (#38331) * fix conv act int8 scale * add unit test for conv+hard_swish --- .../ir/mkldnn/fc_act_mkldnn_fuse_pass.h | 2 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 54 ++++++++++------ .../quantization/quant2_int8_mkldnn_pass.py | 1 + .../mkldnn/test_conv2d_int8_mkldnn_op.py | 64 ++++++++++++++++--- 4 files changed, 90 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h index 7e039d9852fc3..81294dd568926 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h @@ -42,4 +42,4 @@ class FuseFCActOneDNNPass : public FusePassBase { } // namespace ir } // namespace framework -} // namespace paddlea +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1bde58f7c4edb..0526ae52b3903 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -218,13 +218,15 @@ class ConvMKLDNNHandlerT : dnnl::prop_kind::forward_training; float sum_scale = 1.0f; + float activation_scale = 1.0f; std::vector output_shift_scale; if (platform::is_int8()) - std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(ctx); const dnnl::primitive_attr conv_attr = CreatePostOps( fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, - output_shift_scale, sum_scale); // for INT8 only! + output_shift_scale, sum_scale, activation_scale); // for INT8 only! if (bias) { auto bias_tz = framework::vectorize(bias->dims()); @@ -432,7 +434,7 @@ class ConvMKLDNNHandlerT return bias_scale_tuple; } - std::tuple> get_int8_scales( + std::tuple, float> get_int8_scales( const framework::ExecutionContext& ctx) const { const auto* filter = ctx.Input("Filter"); const auto& weights_tz = framework::vectorize(filter->dims()); @@ -445,8 +447,14 @@ class ConvMKLDNNHandlerT const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); auto scale_weights_data = ctx.Attr>("Scale_weights"); bool is_multi_channel = scale_weights_data.size() > 1; + bool has_activation = !ctx.Attr("fuse_activation").empty(); + float activation_scale = + force_fp32_output ? 1.0f : has_activation ? ctx.Attr("Scale_out") + : 1.0f; auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + force_fp32_output ? 1.0f : has_activation + ? 1.0f + : ctx.Attr("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; int count = @@ -468,13 +476,13 @@ class ConvMKLDNNHandlerT static_cast(scale_weights_data[i]))); } - return std::make_tuple(sum_scale, output_shift_scale); + return std::make_tuple(sum_scale, output_shift_scale, activation_scale); } dnnl::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f) { + float sum_scale = 1.0f, float activation_scale = 1.0f) { dnnl::primitive_attr conv_attr; dnnl::post_ops post_operations; if (output_shift_scale.size() > 0) { @@ -492,30 +500,34 @@ class ConvMKLDNNHandlerT } // Fusion with ReLU layer is executed through the PostOps feature. 
Create a // PostOps object and configure it to execute an eltwise relu operation. - constexpr float scale = 1.0f; if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_relu, - fuse_alpha, fuse_beta); + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_relu, fuse_alpha, + fuse_beta); } else if (fuse_activation == "relu6") { - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_swish, + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta); + } else if (fuse_activation == "swish") { + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_swish, fuse_alpha, + fuse_beta); } else if (fuse_activation == "hard_swish") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_hardswish, + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_hardswish, fuse_alpha, fuse_beta); } else if (fuse_activation == "hard_sigmoid") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_linear, + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, fuse_alpha, fuse_beta); - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_clip, 0.0f, - 1.0f); + post_operations.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); } else if (fuse_activation == "gelu_tanh") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_gelu_tanh, - 0.0f, 0.0f); + post_operations.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_gelu_tanh, 0.0f, 0.0f); } else if (fuse_activation == "gelu_erf") { - post_operations.append_eltwise(scale, dnnl::algorithm::eltwise_gelu_erf, - 0.0f, 0.0f); + post_operations.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_gelu_erf, 0.0f, 0.0f); } conv_attr.set_post_ops(post_operations); return conv_attr; diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 7dbd927874d19..0251dd693f66f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,6 +426,7 @@ def _optimize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'conv_elementwise_add_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'conv_relu_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'conv_relu6_mkldnn_fuse_pass') + graph = self._apply_pass(graph, 'conv_hard_swish_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'fc_fuse_pass', ['use_gpu', 'use_fc_padding'], [False, False]) graph = self._apply_pass(graph, 'repeated_fc_relu_fuse_pass') diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 7508ecbb2946d..6fc01488c7ea0 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -43,7 +43,7 @@ def setUp(self): self.init_group() self.init_dilation() self.init_test_case() - self.init_fuse_relu() + self.init_fuse_activation() self.init_fuse_residual() self.init_data_type() @@ -54,7 +54,9 @@ def setUp(self): } # This 
implementation of convolution quantization is based on OneDNN documentation # https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html#doxid-dev-guide-int8-computations-1dg-i8-comp-s11 - scale_output_shift = (self.scale_out / + inner_scale = 1. if self.fuse_activation != "" else self.scale_out + activation_scale = self.scale_out if self.fuse_activation != "" else 1. + scale_output_shift = (inner_scale / (self.scale_in * self.scale_weights[0])) filter = np.random.random(self.filter_size).astype(self.weighttype) @@ -78,7 +80,7 @@ def residual_helper(init_low, init_high, output_): init_low, init_high, self.input_residual_size).astype(self.srctype) return (output_ + input_residual_ * - (self.scale_out / self.scale_in_eltwise)), input_residual_ + (inner_scale / self.scale_in_eltwise)), input_residual_ if self.srctype == np.int8: init_low, init_high = (-5, 5) @@ -101,12 +103,24 @@ def residual_helper(init_low, init_high, output_): output, input_residual = residual_helper(init_low, init_high, output) - output = np.round(output) - - if self.fuse_activation == "relu": - output = np.maximum(output, 0) + if self.fuse_activation == "": + pass + elif self.fuse_activation == "relu": + output = activation_scale * np.maximum(output, 0) + elif self.fuse_activation == "hard_swish": + output = activation_scale * output / 6. * np.minimum( + np.maximum(0, output + 3.), 6) + elif self.fuse_activation == "relu6": + output = activation_scale * np.maximum(0, np.minimum(6, output)) + elif self.fuse_activation == "swish": + output = activation_scale * output / (1. + np.exp(-1. * output)) + elif self.fuse_activation == "leaky_relu": + output = activation_scale * np.maximum(output, 0.02 * output) + else: + raise NotImplementedError("test for " + self.fuse_activation + + " activation not implemented") - output = output.astype(self.dsttype) + output = np.round(output).astype(self.dsttype) self.inputs = { 'Input': @@ -131,6 +145,8 @@ def residual_helper(init_low, init_high, output_): 'Scale_weights': self.scale_weights, 'Scale_in_eltwise': self.scale_in_eltwise, 'fuse_activation': self.fuse_activation, + 'fuse_alpha': self.fuse_alpha, + 'fuse_beta': self.fuse_beta, 'fuse_residual_connection': self.fuse_residual, 'mkldnn_data_type': self.mkldnn_data_type } @@ -165,8 +181,10 @@ def init_data_type(self): self.srctype = np.uint8 self.dsttype = np.int8 - def init_fuse_relu(self): + def init_fuse_activation(self): self.fuse_activation = "relu" + self.fuse_alpha = 0 + self.fuse_beta = 0 def init_fuse_residual(self): self.fuse_residual = True @@ -190,6 +208,34 @@ def init_test_case(self): self.scale_in_eltwise = 0.6 +class TestWithHardSwish(TestConv2D): + def init_fuse_activation(self): + self.fuse_activation = "hard_swish" + self.fuse_alpha = 0 + self.fuse_beta = 0 + + +class TestWithRelu6(TestConv2D): + def init_fuse_activation(self): + self.fuse_activation = "relu6" + self.fuse_alpha = 6 + self.fuse_beta = 0 + + +class TestWithSwish(TestConv2D): + def init_fuse_activation(self): + self.fuse_activation = "swish" + self.fuse_alpha = 1 + self.fuse_beta = 0 + + +class TestWithLeakyRelu(TestConv2D): + def init_fuse_activation(self): + self.fuse_activation = "leaky_relu" + self.fuse_alpha = 0.02 + self.fuse_beta = 0 + + class TestWithPad(TestConv2D): def init_test_case(self): TestConv2D.init_test_case(self) From 506092146d05aa94d2430c5b3dfb63d699dca858 Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:02:03 +0800 Subject: [PATCH 114/151] the_one_ps dirs 
reconstruct (#38804)

* delete gloo connect retry

* the_one_ps dirs reconstruct

* .

* .

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* the one ps dirs modify

* the one ps dirs modify

* the one ps dirs modify

* the one ps dirs modify
---
 .../fluid/distributed/dataset_utils/README.md |  6 ++++++
 paddle/fluid/distributed/ps/CMakeLists.txt    |  0
 paddle/fluid/distributed/ps/README.md         |  3 +++
 .../fluid/distributed/ps/coordinator/README.md|  3 +++
 paddle/fluid/distributed/ps/ps.proto          | 13 +++++++++++++
 paddle/fluid/distributed/ps/service/README.md |  8 ++++++++
 .../distributed/ps/wrapper/ps_cpu_wrapper.h   | 18 ++++++++++++++++++
 .../distributed/ps/wrapper/ps_gpu_wrapper.h   | 18 ++++++++++++++++++
 .../distributed/ps/wrapper/ps_heter_wrapper.h | 18 ++++++++++++++++++
 .../fluid/distributed/ps/wrapper/ps_wrapper.h | 18 ++++++++++++++++++
 python/paddle/distributed/ps/README.md        |  3 +++
 python/paddle/distributed/ps/__init__.py      | 13 +++++++++++++
 python/paddle/distributed/ps/the_one_ps.py    | 13 +++++++++++++
 .../ps/utils/compile_time_strategy.py         | 13 +++++++++++++
 .../paddle/distributed/ps/utils/ps_factory.py | 13 +++++++++++++
 .../distributed/ps/utils/ps_infer_utils.py    | 13 +++++++++++++
 16 files changed, 173 insertions(+)
 create mode 100755 paddle/fluid/distributed/dataset_utils/README.md
 create mode 100644 paddle/fluid/distributed/ps/CMakeLists.txt
 create mode 100755 paddle/fluid/distributed/ps/README.md
 create mode 100755 paddle/fluid/distributed/ps/coordinator/README.md
 create mode 100755 paddle/fluid/distributed/ps/ps.proto
 create mode 100755 paddle/fluid/distributed/ps/service/README.md
 create mode 100755 paddle/fluid/distributed/ps/wrapper/ps_cpu_wrapper.h
 create mode 100755 paddle/fluid/distributed/ps/wrapper/ps_gpu_wrapper.h
 create mode 100755 paddle/fluid/distributed/ps/wrapper/ps_heter_wrapper.h
 create mode 100755 paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
 create mode 100755 python/paddle/distributed/ps/README.md
 create mode 100755 python/paddle/distributed/ps/__init__.py
 create mode 100755 python/paddle/distributed/ps/the_one_ps.py
 create mode 100755 python/paddle/distributed/ps/utils/compile_time_strategy.py
 create mode 100755 python/paddle/distributed/ps/utils/ps_factory.py
 create mode 100755 python/paddle/distributed/ps/utils/ps_infer_utils.py

diff --git a/paddle/fluid/distributed/dataset_utils/README.md b/paddle/fluid/distributed/dataset_utils/README.md
new file mode 100755
index 0000000000000..b1637c185e63d
--- /dev/null
+++ b/paddle/fluid/distributed/dataset_utils/README.md
@@ -0,0 +1,6 @@
+# Directory description
+
+> Replaces the original index_dataset directory
+dataset sampling utility classes
+user-defined data processing .so plugins
+streaming dataserver related classes
diff --git a/paddle/fluid/distributed/ps/CMakeLists.txt b/paddle/fluid/distributed/ps/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/fluid/distributed/ps/README.md b/paddle/fluid/distributed/ps/README.md
new file mode 100755
index 0000000000000..d287dcd111198
--- /dev/null
+++ b/paddle/fluid/distributed/ps/README.md
@@ -0,0 +1,3 @@
+# Directory description
+
+> Once this takes shape, the table, thirdparty, table and service directories in the parent directory can be deleted, as well as communicator_common.h, fleet.cc and fleet.h
diff --git a/paddle/fluid/distributed/ps/coordinator/README.md b/paddle/fluid/distributed/ps/coordinator/README.md
new file mode 100755
index 0000000000000..774c7a1809dfd
--- /dev/null
+++ b/paddle/fluid/distributed/ps/coordinator/README.md
@@ -0,0 +1,3 @@
+# Directory description
+
+* Coordinator functionality classes
diff --git a/paddle/fluid/distributed/ps/ps.proto b/paddle/fluid/distributed/ps/ps.proto
new file mode 100755
index 0000000000000..2691f637527d4
--- /dev/null
+++ b/paddle/fluid/distributed/ps/ps.proto
@@ -0,0 +1,13 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
\ No newline at end of file
diff --git a/paddle/fluid/distributed/ps/service/README.md b/paddle/fluid/distributed/ps/service/README.md
new file mode 100755
index 0000000000000..a219e92c63b75
--- /dev/null
+++ b/paddle/fluid/distributed/ps/service/README.md
@@ -0,0 +1,8 @@
+# Directory description
+
+* PSServer
+* PSClient
+* PsService
+* Communicator
+* MessageBusFramework
+* *.proto
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_cpu_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_cpu_wrapper.h
new file mode 100755
index 0000000000000..2175bacb965ff
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/ps_cpu_wrapper.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_CPU_WRAPPER_H_
+#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_CPU_WRAPPER_H_
+
+#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_CPU_WRAPPER_H_
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_gpu_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_gpu_wrapper.h
new file mode 100755
index 0000000000000..a73c74efff426
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/ps_gpu_wrapper.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_GPU_WRAPPER_H_
+#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_GPU_WRAPPER_H_
+
+#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_GPU_WRAPPER_H_
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_heter_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_heter_wrapper.h
new file mode 100755
index 0000000000000..b1c15d1e2035f
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/ps_heter_wrapper.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_HETER_WRAPPER_H_
+#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_HETER_WRAPPER_H_
+
+#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_HETER_WRAPPER_H_
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
new file mode 100755
index 0000000000000..c92835aa995ad
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
+#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
+
+#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
diff --git a/python/paddle/distributed/ps/README.md b/python/paddle/distributed/ps/README.md
new file mode 100755
index 0000000000000..8d28031794f5d
--- /dev/null
+++ b/python/paddle/distributed/ps/README.md
@@ -0,0 +1,3 @@
+# Directory description
+
+* Once this refactor is finished, the related files (and folders) under fleet in the upper-level directory can be deleted
diff --git a/python/paddle/distributed/ps/__init__.py b/python/paddle/distributed/ps/__init__.py
new file mode 100755
index 0000000000000..97043fd7ba688
--- /dev/null
+++ b/python/paddle/distributed/ps/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py new file mode 100755 index 0000000000000..e1663029ef1f8 --- /dev/null +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/ps/utils/compile_time_strategy.py b/python/paddle/distributed/ps/utils/compile_time_strategy.py new file mode 100755 index 0000000000000..e1663029ef1f8 --- /dev/null +++ b/python/paddle/distributed/ps/utils/compile_time_strategy.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py new file mode 100755 index 0000000000000..e1663029ef1f8 --- /dev/null +++ b/python/paddle/distributed/ps/utils/ps_factory.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/ps/utils/ps_infer_utils.py b/python/paddle/distributed/ps/utils/ps_infer_utils.py new file mode 100755 index 0000000000000..e1663029ef1f8 --- /dev/null +++ b/python/paddle/distributed/ps/utils/ps_infer_utils.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
From 050fd16876f54ae4aad9885a3ea6edefa6faa34d Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 12 Jan 2022 19:27:12 +0800 Subject: [PATCH 115/151] [IPU] add more ops (#38831) * support more ops * Co-authored-by: Xiaobing Wang Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Haicheng Jiang Co-authored-by: Han Zhao * add authors Co-authored-by: Xiaobing Wang Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Haicheng Jiang Co-authored-by: Han Zhao * update date Co-authored-by: Xiaobing Wang Co-authored-by: Zhixin Yao Co-authored-by: Haicheng Jiang Co-authored-by: Han Zhao --- .../ir/ipu/popart_canonicalization_pass.cc | 1 - .../popart_canonicalization/activation_ops.cc | 32 ++- .../canonicalization_utils.cc | 11 + .../canonicalization_utils.h | 6 +- .../ipu/popart_canonicalization/logic_ops.cc | 14 ++ .../ipu/popart_canonicalization/math_ops.cc | 225 +++++++++++++----- .../ipu/popart_canonicalization/nn_ops.cc | 23 +- .../ipu/popart_canonicalization/op_builder.cc | 34 ++- .../ipu/popart_canonicalization/op_builder.h | 9 +- .../ipu/popart_canonicalization/other_ops.cc | 65 +++++ .../ipu/popart_canonicalization/search_ops.cc | 72 +++--- .../ipu/popart_canonicalization/tensor_ops.cc | 159 +++++++++++-- 12 files changed, 517 insertions(+), 134 deletions(-) create mode 100644 paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index c97b7fd5bcb0c..d2d76f9a9a2f9 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" -#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index 5793c4c0e3ca6..fc2f1e476b92e 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -48,7 +48,37 @@ Node *sqrt_handler(Graph *graph, Node *node) { } Node *gelu_handler(Graph *graph, Node *node) { - return activation_op_handler(graph, node, "popart_gelu_v2"); + auto *op = node->Op(); + auto approximate_ = BOOST_GET_CONST(bool, op->GetAttr("approximate")); + if (approximate_) { + return activation_op_handler(graph, node, "popart_gelu_v2"); + } else { + auto sqrt2 = CreateConst(graph, node, {}, {}, + {{"value", std::vector{1.4142135623730951}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDtype(node)}}); + auto zero_point_five = + CreateConst(graph, node, {}, {}, {{"value", std::vector{0.5}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDtype(node)}}); + auto one = + CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDtype(node)}}); + auto div = + CreateBaseOp(graph, node, "popart_div", + {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {}); + auto erf = + CreateBaseOp(graph, node, "popart_erf", {div->outputs[0]}, {}, {}); + auto add = CreateBaseOp(graph, node, "popart_add", + {erf->outputs[0], one->outputs[0]}, {}, {}); + auto mul1 = + 
CreateBaseOp(graph, node, "popart_mul", + {GetInputVarNode("X", node), add->outputs[0]}, {}, {}); + return CreateBaseOp(graph, node, "popart_mul", + {mul1->outputs[0], zero_point_five->outputs[0]}, + {GetOutputVarNode("Out", node)}, {}); + } } Node *log_softmax_handler(Graph *graph, Node *node) { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc index d46fc55ec6ce0..3d22f75d345d6 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -180,6 +180,17 @@ const bool is_float_equal(float a, float b, float eps) { return std::fabs(a - b) <= eps; } +const int GetOutputVarDtype(const Node *node, const std::string &output_name) { + auto out_node = GetOutputVarNode(output_name, node); + PADDLE_ENFORCE_NOT_NULL(out_node, platform::errors::Unavailable( + "Node's out node does not exist.")); + auto var = out_node->Var(); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::Unavailable("Node is not a variable.")); + auto proto_var_type = var->GetDataType(); + return VarType2OnnxDtype(proto_var_type); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h index c1b2bd0c8b5fd..5725ec767a425 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -23,10 +23,6 @@ namespace paddle { namespace platform { namespace ipu { -using framework::ir::Graph; -using framework::ir::Node; -using framework::OpDesc; - #define REGISTER_HANDLER(name, func) \ static bool __UNUSED_##name = \ paddle::platform::ipu::RegisterHandler(#name, func) @@ -58,6 +54,8 @@ Node *GetOutputVarNodeByVarName(const std::string &var_name, const Node *op_node); const bool is_float_equal(float a, float b, float eps = 1e-8); +const int GetOutputVarDtype(const Node *node, + const std::string &output_name = "Out"); } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index 92362ebf5be7d..c980bb780cfc0 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -28,7 +28,21 @@ Node *equal_handler(Graph *graph, Node *node) { return new_node; } +Node *logical_not_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_logical_not", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + +Node *greater_than_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_greater", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + REGISTER_HANDLER(equal, equal_handler); +REGISTER_HANDLER(logical_not, logical_not_handler); +REGISTER_HANDLER(greater_than, greater_than_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index af7e4d0c7dbe9..67012e8d4b92d 100644 --- 
a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -41,7 +41,8 @@ Node *pow_handler(Graph *graph, Node *node) { // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); auto attrs = - MakeConstAttrMapFromValue(value_, {1}, ONNXDataType::FLOAT); + MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDtype(node)); + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), new_node_const->outputs[0]}, @@ -122,16 +123,16 @@ Node *matmul_handler(Graph *graph, Node *node) { y_node = y_node->outputs[0]; } if (is_float_equal(alpha, 1.0)) { + return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, + node->outputs); + } else { auto o_node = CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {}); - auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT); + auto attr = MakeConstAttrMapFromValue(alpha, {1}, GetOutputVarDtype(node)); auto const_node = CreateConst(graph, node, {}, {}, attr); return CreateBaseOp(graph, node, "popart_mul", {o_node->outputs[0], const_node->outputs[0]}, node->outputs); - } else { - return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, - node->outputs); } } @@ -141,7 +142,10 @@ Node *sum_handler(Graph *graph, Node *node) { Node *softmax_handler(Graph *graph, Node *node) { auto *op = node->Op(); - auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + int axis = -1; + if (op->HasAttr("axis")) { + axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + } return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis); } @@ -153,42 +157,72 @@ Node *scale_handler(Graph *graph, Node *node) { BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale")); auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType(); - auto new_node_bias_var = - CreateConst(graph, node, {}, {}, {{"value", std::vector{bias_}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::FLOAT}}); - new_node_bias_var = new_node_bias_var->outputs[0]; - - Node *new_node_scale_var = nullptr; - if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) { - new_node_scale_var = GetInputVarNode("ScaleTensor", node); - } else { - new_node_scale_var = - CreateConst(graph, node, {}, {}, {{"value", std::vector{scale_}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::FLOAT}}); - new_node_scale_var = new_node_scale_var->outputs[0]; - } + auto cast = CreateCast(graph, node, {GetInputVarNode("X", node)}, {}, + static_cast(framework::proto::VarType::FP32)); - // convert to float32 - auto new_node_cast = - CreateCast(graph, node, {GetInputVarNode("X", node)}, {}, - static_cast(framework::proto::VarType::FP32)); Node *result = nullptr; - if (bias_after_scale_) { - auto new_node_mul = - CreateBaseOp(graph, node, "popart_mul", - {new_node_cast->outputs[0], new_node_scale_var}, {}, {}); - result = - CreateBaseOp(graph, node, "popart_add", - {new_node_mul->outputs[0], new_node_bias_var}, {}, {}); + if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) { + auto scale = GetInputVarNode("ScaleTensor", node); + if (is_float_equal(bias_, 0.0)) { + result = CreateBaseOp(graph, node, "popart_mul", + {cast->outputs[0], scale}, {}, {}); + } else { + auto bias = CreateConst(graph, node, {}, {}, + {{"value", std::vector{bias_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + bias = bias->outputs[0]; 
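+      // bias_after_scale == true  : Out = scale * X + bias
+      // bias_after_scale == false : Out = scale * (X + bias)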
+ if (bias_after_scale_) { + auto mul = CreateBaseOp(graph, node, "popart_mul", + {cast->outputs[0], scale}, {}, {}); + result = CreateBaseOp(graph, node, "popart_add", + {mul->outputs[0], bias}, {}, {}); + } else { + auto add = CreateBaseOp(graph, node, "popart_add", + {cast->outputs[0], bias}, {}, {}); + result = CreateBaseOp(graph, node, "popart_mul", + {add->outputs[0], scale}, {}, {}); + } + } } else { - auto new_node_add = - CreateBaseOp(graph, node, "popart_add", - {new_node_cast->outputs[0], new_node_bias_var}, {}, {}); - result = - CreateBaseOp(graph, node, "popart_mul", - {new_node_add->outputs[0], new_node_scale_var}, {}, {}); + if (is_float_equal(bias_, 0.0) && is_float_equal(scale_, 1.0)) { + return CreateBaseOp(graph, node, "popart_identity", + {GetInputVarNode("X", node)}, node->outputs, {}); + } else if (is_float_equal(scale_, 1.0)) { + auto bias = CreateConst(graph, node, {}, {}, + {{"value", std::vector{bias_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + result = CreateBaseOp(graph, node, "popart_add", + {cast->outputs[0], bias->outputs[0]}, {}, {}); + } else if (is_float_equal(bias_, 0.0)) { + auto scale = CreateConst(graph, node, {}, {}, + {{"value", std::vector{scale_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + result = CreateBaseOp(graph, node, "popart_mul", + {cast->outputs[0], scale->outputs[0]}, {}, {}); + } else { + auto bias = CreateConst(graph, node, {}, {}, + {{"value", std::vector{bias_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + auto scale = CreateConst(graph, node, {}, {}, + {{"value", std::vector{scale_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + if (bias_after_scale_) { + auto mul = CreateBaseOp(graph, node, "popart_mul", + {cast->outputs[0], scale->outputs[0]}, {}, {}); + result = CreateBaseOp(graph, node, "popart_add", + {mul->outputs[0], bias->outputs[0]}, {}, {}); + } else { + auto add = CreateBaseOp(graph, node, "popart_add", + {cast->outputs[0], bias->outputs[0]}, {}, {}); + result = CreateBaseOp(graph, node, "popart_mul", + {add->outputs[0], scale->outputs[0]}, {}, {}); + } + } } auto result_after_cast = CreateCast(graph, node, result->outputs, node->outputs, @@ -199,16 +233,27 @@ Node *scale_handler(Graph *graph, Node *node) { Node *cross_entropy2_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); - auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {}, - framework::proto::VarType::INT32); + Node *new_cast = nullptr; + if (GetInputVarNode("Label", node)->Var()->GetDataType() == + framework::proto::VarType::INT32) { + new_cast = GetInputVarNode("Label", node); + } else { + auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, + {}, framework::proto::VarType::INT32); + new_cast = new_cast->outputs[0]; + } auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); - if (label_shape_.size() == 1) { - return CreateBaseOp(graph, node, "popart_nllloss", - {GetInputVarNode("X", node), new_cast->outputs[0]}, - {GetOutputVarNode("Y", node)}, - { - {"ignoreIndex", ignoreIndex}, - }); + if (label_shape_[label_shape_.size() - 1] != 1) { + auto log = CreateBaseOp(graph, node, "popart_log", + {GetInputVarNode("X", node)}, {}, {}); + return CreateBaseOp( + graph, node, "popart_nllloss_v2", {log->outputs[0], new_cast}, + {GetOutputVarNode("Y", node)}, + { + {"reduction", 2}, // popart::ReductionType::NoReduction + {"ignoreIndex", 
ignoreIndex}, + {"inputIsLogProbability", true}, + }); } else { std::vector new_shape_{label_shape_[0]}; auto const_before_loss = CreateBaseOp( @@ -218,15 +263,19 @@ Node *cross_entropy2_handler(Graph *graph, Node *node) { std::vector{static_cast(new_shape_.size())}}, {"dtype", ONNXDataType::INT64}}); - auto reshape_before_loss = CreateBaseOp( - graph, node, "popart_reshape", - {new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {}); + auto reshape_before_loss = + CreateBaseOp(graph, node, "popart_reshape", + {new_cast, const_before_loss->outputs[0]}, {}, {}); + auto log = CreateBaseOp(graph, node, "popart_log", + {GetInputVarNode("X", node)}, {}, {}); auto nllloss = CreateBaseOp( - graph, node, "popart_nllloss", - {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {}, + graph, node, "popart_nllloss_v2", + {log->outputs[0], reshape_before_loss->outputs[0]}, {}, { + {"reduction", 2}, // popart::ReductionType::NoReduction {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", true}, }); auto const_after_loss = CreateBaseOp( @@ -244,6 +293,73 @@ Node *cross_entropy2_handler(Graph *graph, Node *node) { } } +Node *cumsum_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto exclusive = BOOST_GET_CONST(bool, op->GetAttr("exclusive")); + int64_t popart_exclusive = 1 ? exclusive : 0; + auto reverse = BOOST_GET_CONST(bool, op->GetAttr("reverse")); + int64_t popart_reverse = 1 ? reverse : 0; + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto axis_node = + CreateConst(graph, node, {}, {}, {{"value", std::vector{axis}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + return CreateBaseOp( + graph, node, "popart_cumsum", + {GetInputVarNode("X", node), axis_node->outputs[0]}, + {GetOutputVarNode("Out", node)}, + {{"exclusive", popart_exclusive}, {"reverse", popart_reverse}}); +} + +Node *matmul_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("trans_x")); + auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("trans_y")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + + std::vector perm; + int x_rank = x_shape.size(); + if (x_rank == 1) { + perm = std::vector{0}; + } else if (x_rank == 2) { + perm = std::vector{1, 0}; + } else if (x_rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (x_rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", x_rank)); + } + + Node *x_node = GetInputVarNode("X", node); + Node *y_node = GetInputVarNode("Y", node); + + if (transpose_x) { + x_node = CreateBaseOp(graph, node, "popart_transpose", + {GetInputVarNode("X", node)}, {}, {{"perm", perm}}); + x_node = x_node->outputs[0]; + } + if (transpose_y) { + y_node = CreateBaseOp(graph, node, "popart_transpose", + {GetInputVarNode("Y", node)}, {}, {{"perm", perm}}); + y_node = y_node->outputs[0]; + } + + return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, + node->outputs); +} + +Node *arg_max_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axis = BOOST_GET_CONST(int64_t, op->GetAttr("axis")); + return CreateBaseOp(graph, node, "popart_argmax", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, + {{"axis", axis}, {"keepdims", int64_t{0}}}); +} + REGISTER_HANDLER(mean, mean_handler); REGISTER_HANDLER(pow, pow_handler); REGISTER_HANDLER(mul, mul_handler); @@ -252,6 +368,9 
@@ REGISTER_HANDLER(sum, sum_handler); REGISTER_HANDLER(softmax, softmax_handler); REGISTER_HANDLER(scale, scale_handler); REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); +REGISTER_HANDLER(cumsum, cumsum_handler); +REGISTER_HANDLER(matmul_v2, matmul_v2_handler); +REGISTER_HANDLER(arg_max, arg_max_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index 58f3e42b7387a..b7412000107d3 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -22,7 +22,7 @@ namespace ipu { namespace { Node *conv2d_handler(Graph *graph, Node *node) { - OpDesc *op = node->Op(); + auto *op = node->Op(); auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); auto dilations = std::vector{dilations_.begin(), dilations_.end()}; auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); @@ -193,6 +193,21 @@ Node *layer_norm_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis")); auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + int64_t groups_ = 1; + + auto groupnorm_attrs_ = + AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}}; + + if (input_shape_.size() == 2) { + return CreateBaseOp( + graph, node, "popart_groupnormalization_v2", + {GetInputVarNode("X", node), GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}, + {GetOutputVarNode("Y", node), GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}, + groupnorm_attrs_); + } std::vector norm_shape_{1, 1}; for (int i = 0; i < input_shape_.size(); i++) { @@ -213,10 +228,6 @@ Node *layer_norm_handler(Graph *graph, Node *node) { graph, node, "popart_reshape", {GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {}); - auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); - int64_t groups_ = 1; - auto groupnorm_attrs_ = - AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}}; auto out_Y_ = MakeVarNode(graph, node); CreateBaseOp(graph, node, "popart_groupnormalization_v2", {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node), @@ -262,7 +273,7 @@ Node *dropout_handler(Graph *graph, Node *node) { CreateConst(graph, node, {}, {}, {{"value", std::vector{1 - dropout_prob_}}, {"dims", std::vector{1}}, - {"dtype", ONNXDataType::FLOAT}}); + {"dtype", GetOutputVarDtype(node)}}); return CreateBaseOp(graph, node, "popart_mul", {GetInputVarNode("X", node), scale->outputs[0]}, {GetOutputVarNode("Out", node)}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index b7a3a8ca7c60f..3ec1999edc4f0 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -31,15 +31,31 @@ const std::string GenerateOpName() { } const std::string CreateOpIdentifyId(Node *node) { - // format: op_type|out_var0|out_var1|...|_gen_* + // format: + // if has custom op_namescope: + // {op_namescope}/op_type/_gen_* + // else: + // {op_type}/{out_var0}/{out_var1}/.../_gen_* // this name will be used as op name when exporting onnx model from popart auto op_type = node->Name(); - std::string op_out = ""; - 
for (auto *out_node : node->outputs) { - op_out += "|"; - op_out += out_node->Name(); + std::string op_namescope; + if (node->Op()->HasAttr("op_namescope")) { + op_namescope = + BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope")); + } else { + op_namescope = "/"; + } + + if (op_namescope != "/") { + return {op_namescope + op_type + "/" + GenerateOpName()}; + } else { + std::string op_out = ""; + for (auto *out_node : node->outputs) { + op_out += "/"; + op_out += out_node->Name(); + } + return {op_type + op_out + "/" + GenerateOpName()}; } - return {op_type + op_out + "|" + GenerateOpName()}; } Node *MakeVarNode(Graph *graph, Node *node) { @@ -100,6 +116,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, if (!new_node->Op()->HasAttr(sIpuStageAttr)) { CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op()); } + if (node->Op()->HasAttr(sMatmulSerializeFactor)) { + CopyOpAttr(sMatmulSerializeFactor, node->Op(), new_node->Op()); + } + if (node->Op()->HasAttr(sMatmulSerializeMode)) { + CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op()); + } { new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); new_node->Op()->Flush(); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h index 7e70e56ef9166..de3788e437a42 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -14,15 +14,16 @@ #pragma once -#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +using paddle::framework::AttributeMap; +using paddle::framework::Attribute; + namespace paddle { namespace platform { namespace ipu { -using paddle::framework::AttributeMap; - template AttributeMap MakeConstAttrMap(std::vector value, std::vector dims, int dtype) { @@ -56,7 +57,7 @@ Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, const AttributeMap &attrs); -// otype is proto::VarType::Type +// otype is framework::proto::VarType::Type Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, const int otype); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc new file mode 100644 index 0000000000000..0919afef4d83a --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
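+// Canonicalization handlers for ops that do not fit the other categories:
+// custom_op, print, popart_optimizer and checkpointoutput.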
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *custom_op_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto attrs = op->GetAttrMap(); + attrs.insert({"__op_type", node->Op()->Type()}); + auto new_node = CreateBaseOp(graph, node, "popart_custom_op", node->inputs, + node->outputs, attrs); + return new_node; +} + +Node *print_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto print_phase = BOOST_GET_CONST(std::string, op->GetAttr("print_phase")); + int64_t print_gradient = 0; + if (print_phase != "forward") { + print_gradient = 1; + } + auto title = BOOST_GET_CONST(std::string, op->GetAttr("message")); + if (title.empty()) { + title = GetInputVarNode("In", node)->Var()->Name(); + } + auto attrs = + AttributeMap{{"print_gradient", print_gradient}, {"title", title}}; + return CreateBaseOp(graph, node, "popart_printtensor", node->inputs, + node->outputs, attrs); +} + +Node *popart_optimizer_handler(Graph *graph, Node *node) { return nullptr; } + +Node *checkpointoutput_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_checkpointoutput", node->inputs, + node->outputs); +} + +REGISTER_HANDLER(custom_op, custom_op_handler); +REGISTER_HANDLER(print, print_handler); +REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler); +REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc index e90faa502ec64..662660c23b4a6 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc @@ -21,24 +21,24 @@ namespace platform { namespace ipu { namespace { -Node *topK_op_handler(Graph *graph, Node *node) { - VLOG(10) << "[topK_op_handler] entering to handler ..."; +Node *topk_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto attrs = AttributeMap{}; - int axis_32INT = -1; + + int axis_ = -1; if (op->HasAttr("axis")) { - axis_32INT = BOOST_GET_CONST(int, op->GetAttr("axis")); + axis_ = BOOST_GET_CONST(int, op->GetAttr("axis")); } - if (axis_32INT == -1) { + if (axis_ == -1) { auto shape = GetInputVarNode("X", node)->Var()->GetShape(); int rank = shape.size(); if (rank < 1) { PADDLE_THROW(platform::errors::InvalidArgument( "The dimension of the shape of topK input should be large than 1")); } - axis_32INT = rank - 1; + axis_ = rank - 1; } - int64_t axis = int64_t{axis_32INT}; + int64_t axis = int64_t{axis_}; attrs.emplace("axis", axis); bool largest = true; @@ -63,45 +63,31 @@ Node *topK_op_handler(Graph *graph, Node *node) { attrs.emplace("sorted", 0); } - std::vector inputs = node->inputs; - if (node->inputs.size() == 2) { - // Input X tensor and K const tensor - VLOG(10) << "[topK_op_handler] get 2 input tensors."; - inputs[0] = node->inputs[1]; // K_t - VLOG(10) << "[topK_op_handler] input node(" << inputs[0]->Var()->Name() - << ")"; - inputs[1] = node->inputs[0]; // X - VLOG(10) << "[topK_op_handler] input node(" << inputs[1]->Var()->Name() - << ")"; - } else if (node->inputs.size() == 1) { - // Input X tensor with k integer - VLOG(10) << 
"[topK_op_handler] get 1 input tensor."; - int k_32INT = BOOST_GET_CONST(int, op->GetAttr("k")); - int64_t k = int64_t{k_32INT}; - attrs.emplace("k", k); - } - // show output node dtype - for (auto *o_node : node->outputs) { - auto *var = o_node->Var(); - // see framework.pb.h - // VarType_Type_INT64 = 3, - // VarType_Type_FP32 = 5, - auto dtype = var->GetDataType(); - if (dtype == 3) { - // poplar does not support int64_t - var->SetDataType(framework::proto::VarType::INT32); - } - std::string name = var->Name(); - VLOG(10) << "[topK_op_handler] output node(" << name - << ") dtype : " << dtype; + Node *var_x = GetInputVarNode("X", node); + Node *var_k = nullptr; + if (op->HasInput("K") && !op->Input("K").empty()) { + var_k = GetInputVarNode("K", node); + } else { + auto k = BOOST_GET_CONST(int, op->GetAttr("k")); + auto *op_k = + CreateConst(graph, node, {}, {}, {{"value", std::vector{k}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + var_k = op_k->outputs[0]; } - VLOG(10) << "[topK_op_handler] leave the handler."; - return CreateBaseOp(graph, node, "popart_topk", inputs, - {node->outputs[1], node->outputs[0]}, attrs); + + auto *var_i = MakeVarNode(graph, node); + CreateBaseOp(graph, node, "popart_topk", {var_x, var_k}, + {GetOutputVarNode("Out", node), var_i}, + {{"axis", int64_t{axis}}, + {"largest", int64_t{largest}}, + {"sorted", int64_t{sorted}}}); + return CreateCast(graph, node, {var_i}, {GetOutputVarNode("Indices", node)}, + static_cast(framework::proto::VarType::INT32)); } -REGISTER_HANDLER(top_k, topK_op_handler); -REGISTER_HANDLER(top_k_v2, topK_op_handler); +REGISTER_HANDLER(top_k, topk_handler); +REGISTER_HANDLER(top_k_v2, topk_handler); } // namespace } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index f1f77b53e4614..296668890ebe5 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -21,9 +21,6 @@ namespace platform { namespace ipu { namespace { -using framework::Attribute; -using framework::AttributeMap; - Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); if (op->HasInput("ShapeTensor") && !op->Input("ShapeTensor").empty()) { @@ -133,6 +130,14 @@ Node *reshape_handler(Graph *graph, Node *node) { return new_node_reshape; } +Node *flatten2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + return CreateBaseOp( + graph, node, "popart_flatten", {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, {{"axis", int64_t(axis)}}); +} + Node *gather_handler(Graph *graph, Node *node) { auto new_node_gather = CreateBaseOp(graph, node, "popart_gather", @@ -169,7 +174,8 @@ Node *cast_handler(Graph *graph, Node *node) { return new_node_cast; } -Node *lookup_table_handler(Graph *graph, Node *node) { +Node *lookup_table_op_handler(Graph *graph, Node *node, + const std::string &type) { auto *op = node->Op(); auto padding_idx_ = BOOST_GET_CONST(int64_t, op->GetAttr("padding_idx")); auto w_shape_ = GetInputVarNode("W", node)->Var()->GetShape(); @@ -183,7 +189,7 @@ Node *lookup_table_handler(Graph *graph, Node *node) { auto concat_const = CreateConst(graph, node, {}, {}, {{"value", const_value_}, {"dims", const_shape_}, - {"dtype", ONNXDataType::FLOAT}}); + {"dtype", GetOutputVarDtype(node)}}); auto axes = CreateConst(graph, node, 
{}, {}, {{"value", std::vector{0}}, {"dims", std::vector{1}}, @@ -247,16 +253,28 @@ Node *lookup_table_handler(Graph *graph, Node *node) { w_node = GetInputVarNode("W", node); } - auto squeeze = CreateBaseOp(graph, node, "popart_squeeze", - {GetInputVarNode("Ids", node)}, {}, - {{"axes", std::vector{-1}}}); + // lookup_table and lookup_table_v2 + auto ids = GetInputVarNode("Ids", node); + if (type == "v1") { + ids = CreateBaseOp(graph, node, "popart_squeeze", + {GetInputVarNode("Ids", node)}, {}, + {{"axes", std::vector{-1}}}); + ids = ids->outputs[0]; + } - auto gather = - CreateBaseOp(graph, node, "popart_gather", {w_node, squeeze->outputs[0]}, - {GetOutputVarNode("Out", node)}, {}); + auto gather = CreateBaseOp(graph, node, "popart_gather", {w_node, ids}, + {GetOutputVarNode("Out", node)}, {}); return gather; } +Node *lookup_table_handler(Graph *graph, Node *node) { + return lookup_table_op_handler(graph, node, "v1"); +} + +Node *lookup_table_v2_handler(Graph *graph, Node *node) { + return lookup_table_op_handler(graph, node, "v2"); +} + Node *unsqueeze_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto axes_ = BOOST_GET_CONST(std::vector, op->GetAttr("axes")); @@ -336,11 +354,32 @@ Node *slice_handler(Graph *graph, Node *node) { auto attr = MakeConstAttrMap(axes_, {dim}, ONNXDataType::INT32); axes = CreateConst(graph, node, {}, {}, attr); } - auto new_node = CreateBaseOp( - graph, node, "popart_slice", - {GetInputVarNode("Input", node), starts, ends, axes->outputs[0]}, - node->outputs); - return new_node; + + auto decrease_axis_ = + BOOST_GET_CONST(std::vector, op->GetAttr("decrease_axis")); + auto input_shape_ = GetInputVarNode("Input", node)->Var()->GetShape(); + auto output_shape_ = GetOutputVarNode("Out", node)->Var()->GetShape(); + if (decrease_axis_.size() == 0) { + return CreateBaseOp( + graph, node, "popart_slice", + {GetInputVarNode("Input", node), starts, ends, axes->outputs[0]}, + node->outputs); + } else if (output_shape_ == std::vector{0} || + input_shape_.size() > output_shape_.size()) { + auto slice = CreateBaseOp( + graph, node, "popart_slice", + {GetInputVarNode("Input", node), starts, ends, axes->outputs[0]}, {}, + {}); + return CreateBaseOp(graph, node, "popart_squeeze", {slice->outputs[0]}, + {GetOutputVarNode("Out", node)}, + {{"axes", std::vector{decrease_axis_.begin(), + decrease_axis_.end()}}}); + } else { + return CreateBaseOp( + graph, node, "popart_slice", + {GetInputVarNode("Input", node), starts, ends, axes->outputs[0]}, + node->outputs); + } } Node *expand_handler(Graph *graph, Node *node) { @@ -373,11 +412,94 @@ Node *expand_handler(Graph *graph, Node *node) { return new_node; } +Node *assign_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_identity", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, {}); +} + +Node *fill_any_like_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto value = BOOST_GET_CONST(float, op->GetAttr("value")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto dtype = BOOST_GET_CONST(int, op->GetAttr("dtype")); + auto x_dtype = static_cast(dtype); + size_t size = 1; + for (auto &dim : x_shape) { + size *= dim; + } + + Attribute out_value; + switch (x_dtype) { + case framework::proto::VarType::FP32: + out_value = std::vector(size, value); + break; + case framework::proto::VarType::FP64: + out_value = std::vector(size, value); + break; + case framework::proto::VarType::INT32: + out_value = std::vector(size, value); + break; + case 
framework::proto::VarType::INT64: + out_value = std::vector(size, value); + break; + case framework::proto::VarType::BOOL: + out_value = std::vector(size, value); + break; + default: + PADDLE_THROW( + platform::errors::Unimplemented("fill_any_like dtype: %d", x_dtype)); + } + return CreateConst(graph, node, node->inputs, node->outputs, + AttributeMap{ + {"value", out_value}, + {"dims", x_shape}, + {"dtype", VarType2OnnxDtype(dtype)}, + }); +} + +Node *one_hot_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto depth = BOOST_GET_CONST(int, op->GetAttr("depth")); + auto allow_out_of_range = + BOOST_GET_CONST(bool, op->GetAttr("allow_out_of_range")); + if (allow_out_of_range) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support allow_out_of_range=True")); + } else { + auto depth_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{depth}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + auto value_tensor = + CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); + return CreateBaseOp(graph, node, "popart_onehot", + {GetInputVarNode("X", node), depth_tensor->outputs[0], + value_tensor->outputs[0]}, + {GetOutputVarNode("Out", node)}, + {{"axis", int64_t{-1}}}); + } +} + +Node *split_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto sections = BOOST_GET_CONST(std::vector, op->GetAttr("sections")); + return CreateBaseOp( + graph, node, "popart_split", {GetInputVarNode("X", node)}, node->outputs, + {{"num_outputs", int64_t(sections.size())}, + {"axis", int64_t(axis)}, + {"split", std::vector{sections.begin(), sections.end()}}}); +} + REGISTER_HANDLER(fill_constant, fill_constant_handler); REGISTER_HANDLER(gaussian_random, gaussian_random_handler); REGISTER_HANDLER(uniform_random, uniform_random_handler); REGISTER_HANDLER(transpose2, transpose_handler); REGISTER_HANDLER(reshape2, reshape_handler); +REGISTER_HANDLER(flatten2, flatten2_handler); REGISTER_HANDLER(gather, gather_handler); REGISTER_HANDLER(squeeze2, squeeze_handler); REGISTER_HANDLER(cast, cast_handler); @@ -388,6 +510,11 @@ REGISTER_HANDLER(stack, stack_handler); REGISTER_HANDLER(shape, shape_handler); REGISTER_HANDLER(slice, slice_handler); REGISTER_HANDLER(expand, expand_handler); +REGISTER_HANDLER(assign, assign_handler); +REGISTER_HANDLER(fill_any_like, fill_any_like_handler); +REGISTER_HANDLER(lookup_table_v2, lookup_table_v2_handler); +REGISTER_HANDLER(split, split_handler); +REGISTER_HANDLER(one_hot, one_hot_handler); } // namespace } // namespace ipu From f120148240d7d4b4fe295fcbbd373b38671262a9 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Wed, 12 Jan 2022 20:10:48 +0800 Subject: [PATCH 116/151] pscore perfermance optimization (#38582) --- cmake/external/libmct.cmake | 2 +- .../distributed/common/chunk_allocator.h | 95 +++++ paddle/fluid/distributed/fleet.cc | 39 +- .../distributed/service/brpc_ps_client.cc | 46 ++- .../distributed/service/brpc_ps_server.cc | 10 + paddle/fluid/distributed/table/CMakeLists.txt | 7 + .../distributed/table/common_dense_table.h | 2 +- .../fluid/distributed/table/depends/dense.h | 71 ++-- .../distributed/table/depends/feature_value.h | 247 ++++++------ .../distributed/table/memory_sparse_table.cc | 356 +++++++++--------- .../distributed/table/memory_sparse_table.h | 16 +- .../distributed/test/dense_table_test.cc | 62 +-- 
.../distributed/test/feature_value_test.cc | 31 +- 13 files changed, 583 insertions(+), 401 deletions(-) create mode 100644 paddle/fluid/distributed/common/chunk_allocator.h diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index c002def29c7a2..92c3165fbaa90 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -19,7 +19,7 @@ IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) MESSAGE(STATUS "use pre defined download url") SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) - SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") SET(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct") diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h new file mode 100644 index 0000000000000..17f7bb14224d3 --- /dev/null +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace paddle { +namespace distributed { + +// Fast allocation and deallocation of objects by allocating them in chunks. +template +class ChunkAllocator { + public: + explicit ChunkAllocator(size_t chunk_size = 64) { + CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T))); + _chunk_size = chunk_size; + _chunks = NULL; + _free_nodes = NULL; + _counter = 0; + } + ChunkAllocator(const ChunkAllocator&) = delete; + ~ChunkAllocator() { + while (_chunks != NULL) { + Chunk* x = _chunks; + _chunks = _chunks->next; + free(x); + } + } + template + T* acquire(ARGS&&... 
args) { + if (_free_nodes == NULL) { + create_new_chunk(); + } + + T* x = (T*)(void*)_free_nodes; // NOLINT + _free_nodes = _free_nodes->next; + new (x) T(std::forward(args)...); + _counter++; + return x; + } + void release(T* x) { + x->~T(); + Node* node = (Node*)(void*)x; // NOLINT + node->next = _free_nodes; + _free_nodes = node; + _counter--; + } + size_t size() const { return _counter; } + + private: + struct alignas(T) Node { + union { + Node* next; + char data[sizeof(T)]; + }; + }; + struct Chunk { + Chunk* next; + Node nodes[]; + }; + + size_t _chunk_size; // how many elements in one chunk + Chunk* _chunks; // a list + Node* _free_nodes; // a list + size_t _counter; // how many elements are acquired + + void create_new_chunk() { + Chunk* chunk; + posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + sizeof(Chunk) + sizeof(Node) * _chunk_size); + chunk->next = _chunks; + _chunks = chunk; + + for (size_t i = 0; i < _chunk_size; i++) { + Node* node = &chunk->nodes[i]; + node->next = _free_nodes; + _free_nodes = node; + } + } +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index 9eb6cbecdc752..5caeab832a3e7 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -460,25 +460,7 @@ void FleetWrapper::PushSparseFromTensorAsync( clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0]; CHECK(clk_size == batch_size || clk_size == 1); - std::vector g; - for (framework::LoDTensor* g_tensor : *outputs) { - float* g_ori = g_tensor->data(); - // no cvm - if (batch_size_consist) { // TODO(zhaocaibei123): add config - // scale_sparse_gradient_with_batch_size_ - Eigen::Map< - Eigen::Matrix> - g_mat(g_ori, g_tensor->numel() / fea_dim, fea_dim); - g_mat.rightCols(fea_dim) *= batch_size; - } - - size_t origin = g.size(); - size_t add = g_tensor->numel(); - g.resize(origin + add); - - memcpy(g.data() + origin, g_tensor->data(), add * sizeof(float)); - } - + CHECK(outputs->size() == inputs->size()); std::vector push_keys; push_keys.reserve(MAX_FEASIGN_NUM / 100); std::vector> push_values; @@ -495,9 +477,21 @@ void FleetWrapper::PushSparseFromTensorAsync( const int64_t* clk_tensor = clks->data(); for (size_t index = 0; index < inputs->size(); ++index) { + framework::LoDTensor* g_tensor = outputs->at(index); + float* g = g_tensor->data(); + // no cvm + if (batch_size_consist) { // TODO(zhaocaibei123): add config + // scale_sparse_gradient_with_batch_size_ + Eigen::Map< + Eigen::Matrix> + g_mat(g, g_tensor->numel() / fea_dim, fea_dim); + g_mat.rightCols(fea_dim) *= batch_size; + } + const framework::LoDTensor* tensor = inputs->at(index); const int64_t* ids = tensor->data(); size_t len = tensor->numel(); + output_len = 0; if (tensor->lod().size() > 0) { for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) { @@ -519,7 +513,7 @@ void FleetWrapper::PushSparseFromTensorAsync( float* data = push_values.back().data() + 3; - memcpy(data, g.data() + output_len, sizeof(float) * fea_dim); + memcpy(data, g + output_len, sizeof(float) * fea_dim); ++input_idx; } @@ -542,14 +536,13 @@ void FleetWrapper::PushSparseFromTensorAsync( float* data = push_values.back().data() + 3; - memcpy(data, g.data() + output_len, sizeof(float) * fea_dim); + memcpy(data, g + output_len, sizeof(float) * fea_dim); ++input_idx; } } + CHECK(output_len == g_tensor->numel()); } - VLOG(1) << "output_len: " << output_len << " g.size(): " << g.size(); - CHECK(output_len == 
g.size()); std::vector push_g_vec(input_idx, nullptr); diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index f6b544d22b22d..a0a09b14dba72 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -210,6 +210,23 @@ int32_t BrpcPsClient::initialize() { } } + auto &profiler = CostProfiler::instance(); + profiler.register_profiler("pserver_client_pull_dense"); + profiler.register_profiler("pserver_client_pull_sparse"); + profiler.register_profiler("pserver_client_pull_sparse_local"); + profiler.register_profiler("pserver_client_push_sparse"); + profiler.register_profiler("pserver_client_push_sparse_parse"); + profiler.register_profiler("client_push_sparse_put"); + profiler.register_profiler("pserver_client_push_sparse"); + profiler.register_profiler("pserver_client_push_sparse_merge"); + profiler.register_profiler("pserver_client_push_sparse_rpc"); + profiler.register_profiler("pserver_client_push_dense"); + profiler.register_profiler("pserver_client_push_dense_parse"); + profiler.register_profiler("push_dense_put"); + profiler.register_profiler("pserver_client_push_dense_merge"); + profiler.register_profiler("pserver_client_push_dense_rpc"); + profiler.register_profiler("pserver_client_push_dense_send"); + _running = true; _flushing = false; // 启动异步push线程 @@ -588,6 +605,7 @@ std::future BrpcPsClient::push_sparse_param( std::future BrpcPsClient::pull_dense(Region *regions, size_t region_num, size_t table_id) { + auto timer = std::make_shared("pserver_client_pull_dense"); auto *accessor = table_accessor(table_id); size_t request_call_num = _server_channels.size(); uint32_t num_per_shard = @@ -643,6 +661,7 @@ std::future BrpcPsClient::pull_dense(Region *regions, } closure->set_promise_value(ret); }); + closure->add_timer(timer); auto promise = std::make_shared>(); closure->add_promise(promise); std::future fut = promise->get_future(); @@ -865,6 +884,9 @@ std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, const uint64_t *keys, size_t num, bool is_training) { + auto timer = std::make_shared("pserver_client_pull_sparse"); + auto local_timer = + std::make_shared("pserver_client_pull_sparse_local"); size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -925,7 +947,7 @@ std::future BrpcPsClient::pull_sparse(float **select_values, } closure->set_promise_value(ret); }); - + closure->add_timer(timer); auto promise = std::make_shared>(); closure->add_promise(promise); std::future fut = promise->get_future(); @@ -1110,8 +1132,8 @@ std::future BrpcPsClient::push_sparse(size_t table_id, const uint64_t *keys, const float **update_values, size_t num) { - auto push_timer = - std::make_shared("pserver_client_push_sparse_parse"); + auto push_timer = std::make_shared("pserver_client_push_sparse"); + CostTimer parse_timer("pserver_client_push_sparse_parse"); int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size(); while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) { // LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:" @@ -1121,6 +1143,7 @@ std::future BrpcPsClient::push_sparse(size_t table_id, // push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size(); push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size(); } + auto put_timer = std::make_shared("client_push_sparse_put"); thread_local std::vector>> shard_sorted_kv_list; auto 
*accessor = table_accessor(table_id); @@ -1250,14 +1273,14 @@ void BrpcPsClient::push_sparse_task_consume() { for_each(task_list.begin() + 1, task_list.end(), [&request_kv_num, request_call_num, closure](std::shared_ptr &task) { - // closure->add_timer(task->timer()); + closure->add_timer(task->timer()); closure->add_promise(task->promise()); }); - // CostTimer merge_timer("pserver_client_push_sparse_merge"); - // auto rpc_timer = - // std::make_shared("pserver_client_push_sparse_rpc"); - // closure->add_timer(rpc_timer); + CostTimer merge_timer("pserver_client_push_sparse_merge"); + auto rpc_timer = + std::make_shared("pserver_client_push_sparse_rpc"); + closure->add_timer(rpc_timer); std::vector> merge_status(request_call_num); for (int shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { @@ -1295,6 +1318,7 @@ void BrpcPsClient::push_sparse_task_consume() { std::vector>().swap(merge_status); } } + timeline.Pause(); auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS()); if (wait_ms > 0) { @@ -1464,10 +1488,12 @@ std::future BrpcPsClient::push_dense(const Region *regions, usleep(5000); // 5ms push_dense_async_num = _push_dense_task_queue_map[table_id]->Size(); } + auto push_dense_timer = std::make_shared("push_dense_put"); // auto dense_data = _dense_matrix_obj_pool.get(); auto dense_data = std::make_shared>(); auto async_task = new DenseAsyncTask(dense_data, table_id, push_timer); size_t request_call_num = _server_channels.size(); + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), request_call_num); @@ -1567,6 +1593,7 @@ void BrpcPsClient::push_dense_task_consume() { << total_send_data[total_send_data_size - 2] << total_send_data[0] << " total_send_data[-1]" << total_send_data[total_send_data_size - 1]; + if (scale_gradient && merge_count > 1) { Eigen::Map mat(total_send_data, 1, total_send_data_size); @@ -1585,6 +1612,7 @@ void BrpcPsClient::push_dense_task_consume() { push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size, closure); } + timeline.Pause(); auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS()); if (wait_ms > 0) { @@ -1603,6 +1631,8 @@ void BrpcPsClient::push_dense_raw_gradient( closure->add_timer(timer); uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), request_call_num); + auto send_timer = + std::make_shared("pserver_client_push_dense_send"); for (size_t i = 0; i < request_call_num; ++i) { closure->request(i)->set_cmd_id(PS_PUSH_DENSE_TABLE); closure->request(i)->set_table_id(task->table_id()); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index a1440260bf2e7..dd7072be7de63 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT #include "butil/object_pool.h" +#include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" @@ -117,6 +118,11 @@ int32_t BrpcPsService::initialize() { _service_handler_map[PS_START_PROFILER] = &BrpcPsService::start_profiler; _service_handler_map[PS_STOP_PROFILER] = &BrpcPsService::stop_profiler; _service_handler_map[PS_PUSH_GLOBAL_STEP] = &BrpcPsService::push_global_step; + auto &profiler = CostProfiler::instance(); + 
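+  // Each name registered here corresponds to a CostTimer created in the
+  // request handlers below (pull_dense, push_dense, pull_sparse, push_sparse).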
profiler.register_profiler("pserver_server_pull_dense"); + profiler.register_profiler("pserver_server_push_dense"); + profiler.register_profiler("pserver_server_pull_sparse"); + profiler.register_profiler("pserver_server_push_sparse"); // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -190,6 +196,7 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, "PsRequestMessage.datas is requeired at least 1 for num of dense"); return 0; } + CostTimer timer("pserver_server_pull_dense"); uint32_t num = *(const uint32_t *)request.params(0).c_str(); if (num < 0) { set_response_code(response, -1, @@ -246,6 +253,7 @@ int32_t BrpcPsService::push_dense(Table *table, const PsRequestMessage &request, return 0; } + CostTimer timer("pserver_server_push_dense"); /* Push Content: |--num--|---valuesData---| @@ -356,6 +364,7 @@ int32_t BrpcPsService::pull_sparse(Table *table, return 0; } + CostTimer timer("pserver_server_pull_sparse"); uint32_t num = *(uint32_t *)(request.params(0).c_str()); auto dim = table->value_accesor()->select_dim(); @@ -396,6 +405,7 @@ int32_t BrpcPsService::push_sparse(Table *table, "least 1 for num of sparse_key"); return 0; } + CostTimer timer("pserver_server_push_sparse"); uint32_t num = *(uint32_t *)(request.params(0).c_str()); /* Push Content: diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 0201b627801cb..b0a553f210044 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -16,6 +16,11 @@ set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DIS get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/") +include_directories(${PADDLE_LIB_THIRD_PARTY_PATH}libmct/src/extern_libmct/libmct/include) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + set(EXTERN_DEP "") if(WITH_HETERPS) set(TABLE_SRC common_sparse_table.cc ssd_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc) @@ -43,3 +48,5 @@ cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_pro cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table) cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) + +target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/table/common_dense_table.h b/paddle/fluid/distributed/table/common_dense_table.h index 1fa0226decd56..c8813dc33053f 100644 --- a/paddle/fluid/distributed/table/common_dense_table.h +++ b/paddle/fluid/distributed/table/common_dense_table.h @@ -57,7 +57,7 @@ class CommonDenseTable : public DenseTable { int32_t _push_dense(const float* values, size_t num); private: - const int task_pool_size_ = 1; + const int task_pool_size_ = 10; bool sync = true; std::vector> _shards_task_pool; int param_dim_ = 0; diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index 8e507842bc330..d2042b7a718e6 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -99,6 +99,7 @@ class DSGD : public DenseOptimizer { }; // adam optimizer for dense tensor +// TODO(zhaocaibei123): add CHECK(common_dense_table.task_pool_size_) == 1 class DAdam : public 
DenseOptimizer { public: explicit DAdam(const CommonAccessorParameter& accessor, @@ -131,6 +132,8 @@ class DAdam : public DenseOptimizer { epsilon = 1.0e-8; } + // make sure common_dense_table.task_pool_size_ == 1; + // otherwise, task_pool_size_ times beta1_pow/beta2_pow multiplication void update(const float* update_values, size_t num, int begin, int end) override { auto update_numel = end - begin; @@ -221,45 +224,35 @@ class DAdamD2Sum : public DenseOptimizer { void update(const float* update_values, size_t num, int begin, int end) override { auto update_numel = end - begin; - std::vector grad, grad2, scale; - grad.resize(update_numel); - grad2.resize(update_numel); - scale.resize(update_numel); - - auto blas = GetBlas(); - // copy grad - blas.VCOPY(update_numel, update_values + begin, grad.data()); - blas.VCOPY(update_numel, update_values + begin, grad2.data()); - - // d2sum - blas.SCAL(update_numel, ada_decay_rate[0], ada_d2sum + begin); - ADD(update_numel, ada_d2sum + begin, 1, ada_d2sum + begin); - - // g2sum - blas.SCAL(update_numel, ada_decay_rate[0], ada_g2sum + begin); - blas.VSQUARE(update_numel, grad2.data(), grad2.data()); - blas.VADD(update_numel, ada_g2sum + begin, grad2.data(), ada_g2sum + begin); - - // mom - blas.SCAL(update_numel, mom_decay_rate[0], mom_velocity + begin); - blas.SCAL(update_numel, 1 - mom_decay_rate[0], grad.data()); - blas.VADD(update_numel, mom_velocity + begin, grad.data(), - mom_velocity + begin); - - // scale - float* scale_ = scale.data(); - blas.VDIV(update_numel, ada_g2sum + begin, ada_d2sum + begin, scale_); - ADD(update_numel, scale_, ada_epsilon[0], scale_); - DIV(update_numel, 1 + ada_epsilon[0], scale_, scale_); - SQRT(update_numel, scale_, scale_); - - blas.SCAL(update_numel, learning_rate[0], scale_); - - // TODO(zhaocaibei123): check if there exists elementwise_multiply in blas - // TODO(zhaocaibei123): blas.VMUL - ELE_MUL(update_numel, scale_, mom_velocity + begin, scale_); - - blas.VSUB(update_numel, param + begin, scale_, param + begin); + Eigen::Map mat_ada_g2sum(ada_g2sum + begin, 1, + update_numel); + + Eigen::Map mat_ada_d2sum(ada_d2sum + begin, 1, + update_numel); + Eigen::Map mat_mom_velocity(mom_velocity + begin, 1, + update_numel); + Eigen::Map mat_w(param + begin, 1, update_numel); + + Eigen::Map mat_grad(update_values + begin, 1, + update_numel); + + mat_ada_d2sum = (mat_ada_d2sum * ada_decay_rate[0]).array() + 1; + mat_ada_g2sum = + (mat_ada_g2sum * ada_decay_rate[0]) + mat_grad.cwiseProduct(mat_grad); + + thread_local std::vector scale_vec; + scale_vec.resize(update_numel); + Eigen::Map scale(scale_vec.data(), 1, update_numel); + memcpy(scale_vec.data(), mat_ada_d2sum.data(), + sizeof(float) * update_numel); + + scale = scale.array() * ada_epsilon[0]; + scale = (mat_ada_d2sum + scale).cwiseQuotient(mat_ada_g2sum + scale); + scale = scale.cwiseSqrt(); + mat_mom_velocity = + (mat_mom_velocity - mat_grad) * mom_decay_rate[0] + mat_grad; + + mat_w -= learning_rate[0] * mat_mom_velocity.cwiseProduct(scale); } float* learning_rate; diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/table/depends/feature_value.h index ad037a86bce80..7a83fdec1d7eb 100644 --- a/paddle/fluid/distributed/table/depends/feature_value.h +++ b/paddle/fluid/distributed/table/depends/feature_value.h @@ -14,35 +14,11 @@ #pragma once -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include #include #include "gflags/gflags.h" -#include "butil/object_pool.h" 
-#include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" -#include "paddle/fluid/distributed/thirdparty/round_robin.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" +#include +#include "paddle/fluid/distributed/common/chunk_allocator.h" namespace paddle { namespace distributed { @@ -55,112 +31,169 @@ class FixedFeatureValue { public: FixedFeatureValue() {} ~FixedFeatureValue() {} - float *data() { return data_.data(); } - size_t size() { return data_.size(); } - void resize(size_t size) { data_.resize(size); } - void shrink_to_fit() { data_.shrink_to_fit(); } + float* data() { return _data.data(); } + size_t size() { return _data.size(); } + void resize(size_t size) { _data.resize(size); } + void shrink_to_fit() { _data.shrink_to_fit(); } private: - std::vector data_; + std::vector _data; }; -class SparseTableShard { +template +struct alignas(64) SparseTableShard { public: - typedef typename robin_hood::unordered_map + typedef typename mct::closed_hash_map> map_type; - SparseTableShard() {} - ~SparseTableShard() {} + struct iterator { + typename map_type::iterator it; + size_t bucket; + map_type* buckets; + friend bool operator==(const iterator& a, const iterator& b) { + return a.it == b.it; + } + friend bool operator!=(const iterator& a, const iterator& b) { + return a.it != b.it; + } + const KEY& key() const { return it->first; } + VALUE& value() const { return *(VALUE*)(void*)it->second; } // NOLINT + iterator& operator++() { + ++it; - FixedFeatureValue *Init(const uint64_t &id) { - size_t hash = hasher_(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; + while (it == buckets[bucket].end() && + bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) { + it = buckets[++bucket].begin(); + } - FixedFeatureValue *value = nullptr; - value = butil::get_object(); - table[id] = value; - return value; + return *this; + } + iterator operator++(int) { + iterator ret = *this; + ++*this; + return ret; + } + }; + struct local_iterator { + typename map_type::iterator it; + friend bool operator==(const local_iterator& a, const local_iterator& b) { + return a.it == b.it; + } + friend bool operator!=(const local_iterator& a, const local_iterator& b) { + return a.it != b.it; + } + const KEY& key() const { return it->first; } + VALUE& value() const { return *(VALUE*)(void*)it->second; } // NOLINT + local_iterator& operator++() { + ++it; + return *this; + } + local_iterator operator++(int) { return {it++}; } + }; + + ~SparseTableShard() { clear(); } + bool empty() { return _alloc.size() == 0; } + size_t size() { return _alloc.size(); } + void set_max_load_factor(float x) { + for (size_t bucket = 0; bucket < CTR_SPARSE_SHARD_BUCKET_NUM; bucket++) { + _buckets[bucket].max_load_factor(x); + } } - - // dont judge if (has(id)) - float *Get(const uint64_t &id) { - size_t hash = hasher_(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - // auto &value = table.at(id); - 
// return value->data_.data(); - auto res = table.find(id); - FixedFeatureValue *value = res->second; - return value->data(); + size_t bucket_count() { return CTR_SPARSE_SHARD_BUCKET_NUM; } + size_t bucket_size(size_t bucket) { return _buckets[bucket].size(); } + void clear() { + for (size_t bucket = 0; bucket < CTR_SPARSE_SHARD_BUCKET_NUM; bucket++) { + map_type& data = _buckets[bucket]; + for (auto it = data.begin(); it != data.end(); ++it) { + _alloc.release((VALUE*)(void*)it->second); // NOLINT + } + data.clear(); + } } - - // for load, to reset count, unseen_days - FixedFeatureValue *GetValue(const uint64_t &id) { - size_t hash = hasher_(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - return res->second; + iterator begin() { + auto it = _buckets[0].begin(); + size_t bucket = 0; + while (it == _buckets[bucket].end() && + bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) { + it = _buckets[++bucket].begin(); + } + return {it, bucket, _buckets}; } - - void erase(uint64_t feasign) { - size_t hash = hasher_(feasign); + iterator end() { + return {_buckets[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(), + CTR_SPARSE_SHARD_BUCKET_NUM - 1, _buckets}; + } + local_iterator begin(size_t bucket) { return {_buckets[bucket].begin()}; } + local_iterator end(size_t bucket) { return {_buckets[bucket].end()}; } + iterator find(const KEY& key) { + size_t hash = _hasher(key); size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto iter = table.find(feasign); - if (iter != table.end()) { - butil::return_object(iter->second); - iter = table.erase(iter); + auto it = _buckets[bucket].find_with_hash(key, hash); + if (it == _buckets[bucket].end()) { + return end(); } + return {it, bucket, _buckets}; + } + VALUE& operator[](const KEY& key) { return emplace(key).first.value(); } + std::pair insert(const KEY& key, const VALUE& val) { + return emplace(key, val); } + std::pair insert(const KEY& key, VALUE&& val) { + return emplace(key, std::move(val)); + } + template + std::pair emplace(const KEY& key, ARGS&&... 
args) { + size_t hash = _hasher(key); + size_t bucket = compute_bucket(hash); + auto res = _buckets[bucket].insert_with_hash({key, NULL}, hash); - void clear() {} + if (res.second) { + res.first->second = _alloc.acquire(std::forward(args)...); + } - size_t compute_bucket(size_t hash) { - if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) { - return 0; - } else { - return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS); + return {{res.first, bucket, _buckets}, res.second}; + } + iterator erase(iterator it) { + _alloc.release((VALUE*)(void*)it.it->second); // NOLINT + size_t bucket = it.bucket; + auto it2 = _buckets[bucket].erase(it.it); + while (it2 == _buckets[bucket].end() && + bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) { + it2 = _buckets[++bucket].begin(); } + return {it2, bucket, _buckets}; } - - map_type::iterator end() { - return values_[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(); + void quick_erase(iterator it) { + _alloc.release((VALUE*)(void*)it.it->second); // NOLINT + _buckets[it.bucket].quick_erase(it.it); } - - map_type::iterator Find(uint64_t id) { - size_t hash = hasher_(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return end(); - } else { - return got; + local_iterator erase(size_t bucket, local_iterator it) { + _alloc.release((VALUE*)(void*)it.it->second); // NOLINT + return {_buckets[bucket].erase(it.it)}; + } + void quick_erase(size_t bucket, local_iterator it) { + _alloc.release((VALUE*)(void*)it.it->second); // NOLINT + _buckets[bucket].quick_erase(it.it); + } + size_t erase(const KEY& key) { + auto it = find(key); + if (it == end()) { + return 0; } + quick_erase(it); + return 1; } - - private: - bool Has(const uint64_t id) { - size_t hash = hasher_(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return false; + size_t compute_bucket(size_t hash) { + if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; } else { - return true; + return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS); } } - public: - map_type values_[CTR_SPARSE_SHARD_BUCKET_NUM]; - std::hash hasher_; + private: + map_type _buckets[CTR_SPARSE_SHARD_BUCKET_NUM]; + ChunkAllocator _alloc; + std::hash _hasher; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/memory_sparse_table.cc b/paddle/fluid/distributed/table/memory_sparse_table.cc index da5c51dfd560a..7501207abe09b 100644 --- a/paddle/fluid/distributed/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/table/memory_sparse_table.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
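A short aside on the data structure introduced above, before the memory_sparse_table.cc changes continue: the rewritten SparseTableShard keeps CTR_SPARSE_SHARD_BUCKET_NUM separate hash maps and routes every key to a bucket through the high bits of its hash, which is why begin(), find() and erase() have to hop from bucket to bucket. The sketch below shows only that bucketing idea, using plain std::unordered_map and made-up names; the real shard uses mct::closed_hash_map plus a ChunkAllocator for the value payloads.

#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

// Toy stand-in for a sparse shard: feature id -> float vector.
constexpr size_t kBucketBits = 6;
constexpr size_t kBucketNum = size_t{1} << kBucketBits;  // 64 buckets per shard

class ToyShard {
 public:
  // Route a key to a bucket via the high bits of its hash, mirroring
  // compute_bucket() in the patch (hash >> (bits_of_size_t - bucket_bits)).
  size_t compute_bucket(uint64_t key) const {
    return hasher_(key) >> (sizeof(size_t) * 8 - kBucketBits);
  }

  // operator[] default-constructs the value on first access, which is what
  // lets the new table code write `auto& value = shard[key];` instead of Init().
  std::vector<float>& operator[](uint64_t key) {
    return buckets_[compute_bucket(key)][key];
  }

  bool contains(uint64_t key) const {
    const auto& bucket = buckets_[compute_bucket(key)];
    return bucket.find(key) != bucket.end();
  }

  size_t erase(uint64_t key) { return buckets_[compute_bucket(key)].erase(key); }

  // Whole-shard size has to sum over all buckets.
  size_t size() const {
    size_t n = 0;
    for (const auto& b : buckets_) n += b.size();
    return n;
  }

 private:
  std::unordered_map<uint64_t, std::vector<float>> buckets_[kBucketNum];
  std::hash<uint64_t> hasher_;
};

int main() {
  ToyShard shard;
  shard[42] = {0.0f, 0.1f, 0.2f};  // insert-or-get, like shard[key] in the patch
  shard[42].push_back(0.3f);       // grow the value in place
  std::cout << "size=" << shard.size() << " has_42=" << shard.contains(42) << "\n";
  shard.erase(42);
  std::cout << "size=" << shard.size() << "\n";
  return 0;
}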
+#include #include +#include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/table/memory_sparse_table.h" #include "paddle/fluid/framework/io/fs.h" @@ -25,41 +27,40 @@ namespace paddle { namespace distributed { // TODO(zhaocaibei123): configure -bool FLAGS_pslib_create_value_when_push = false; -int FLAGS_pslib_table_save_max_retry = 3; -bool FLAGS_pslib_enable_create_feasign_randomly = false; +bool FLAGS_pserver_create_value_when_push = false; +int FLAGS_pserver_table_save_max_retry = 3; +bool FLAGS_pserver_enable_create_feasign_randomly = false; int32_t MemorySparseTable::initialize() { - shards_task_pool_.resize(task_pool_size_); - for (int i = 0; i < shards_task_pool_.size(); ++i) { - shards_task_pool_[i].reset(new ::ThreadPool(1)); + _shards_task_pool.resize(_task_pool_size); + for (int i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); } + auto& profiler = CostProfiler::instance(); + profiler.register_profiler("pserver_sparse_update_all"); + profiler.register_profiler("pserver_sparse_select_all"); initialize_value(); VLOG(0) << "initalize MemorySparseTable succ"; return 0; } int32_t MemorySparseTable::initialize_value() { - sparse_table_shard_num_ = static_cast(_config.shard_num()); - avg_local_shard_num_ = - SparseTable::sparse_local_shard_num(sparse_table_shard_num_, _shard_num); - real_local_shard_num_ = avg_local_shard_num_; - if (real_local_shard_num_ * (_shard_idx + 1) > sparse_table_shard_num_) { - real_local_shard_num_ = - sparse_table_shard_num_ - real_local_shard_num_ * _shard_idx; - real_local_shard_num_ = - real_local_shard_num_ < 0 ? 0 : real_local_shard_num_; + _sparse_table_shard_num = static_cast(_config.shard_num()); + _avg_local_shard_num = + SparseTable::sparse_local_shard_num(_sparse_table_shard_num, _shard_num); + _real_local_shard_num = _avg_local_shard_num; + if (_real_local_shard_num * (_shard_idx + 1) > _sparse_table_shard_num) { + _real_local_shard_num = + _sparse_table_shard_num - _real_local_shard_num * _shard_idx; + _real_local_shard_num = + _real_local_shard_num < 0 ? 0 : _real_local_shard_num; } - VLOG(1) << "memory sparse table avg_local_shard_num_: " - << avg_local_shard_num_ - << " real_local_shard_num_: " << real_local_shard_num_; + VLOG(1) << "memory sparse table _avg_local_shard_num: " + << _avg_local_shard_num + << " _real_local_shard_num: " << _real_local_shard_num; - shard_values_.reserve(real_local_shard_num_); + _local_shards.reset(new shard_type[_real_local_shard_num]); - for (int x = 0; x < real_local_shard_num_; ++x) { - auto shard = std::make_shared(); - shard_values_.emplace_back(shard); - } return 0; } @@ -74,7 +75,7 @@ int32_t MemorySparseTable::load(const std::string& path, } int load_param = atoi(param.c_str()); - auto expect_shard_num = sparse_table_shard_num_; + auto expect_shard_num = _sparse_table_shard_num; if (file_list.size() != expect_shard_num) { LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size() << " not equal to expect_shard_num:" << expect_shard_num; @@ -85,14 +86,14 @@ int32_t MemorySparseTable::load(const std::string& path, return -1; } - size_t file_start_idx = _shard_idx * avg_local_shard_num_; + size_t file_start_idx = _shard_idx * _avg_local_shard_num; size_t feature_value_size = _value_accesor->size() / sizeof(float); - // TODO(zhaocaibei123): multi-thread - // int thread_num = shard_values_.size() < 15 ? 
shard_values_.size() : 15; - // omp_set_num_threads(thread_num); - // #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < real_local_shard_num_; ++i) { + + int thread_num = _real_local_shard_num < 15 ? _real_local_shard_num : 15; + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < _real_local_shard_num; ++i) { FsChannelConfig channel_config; channel_config.path = file_list[file_start_idx + i]; VLOG(1) << "MemorySparseTable::load begin load " << channel_config.path @@ -110,21 +111,21 @@ int32_t MemorySparseTable::load(const std::string& path, std::string line_data; auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); char* end = NULL; - auto& shard = shard_values_[i]; + auto& shard = _local_shards[i]; try { while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { uint64_t key = std::strtoul(line_data.data(), &end, 10); - auto* value = shard->Init(key); - value->resize(feature_value_size); + auto& value = shard[key]; + value.resize(feature_value_size); int parse_size = - _value_accesor->parse_from_string(++end, value->data()); - value->resize(parse_size); + _value_accesor->parse_from_string(++end, value.data()); + value.resize(parse_size); // for debug for (int ii = 0; ii < parse_size; ++ii) { VLOG(2) << "MemorySparseTable::load key: " << key << " value " << ii - << ": " << value->data()[ii] << " local_shard: " << i; + << ": " << value.data()[ii] << " local_shard: " << i; } } read_channel->close(); @@ -141,7 +142,7 @@ int32_t MemorySparseTable::load(const std::string& path, LOG(ERROR) << "MemorySparseTable load failed, retry it! path:" << channel_config.path << " , retry_num=" << retry_num; } - if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) { + if (retry_num > paddle::distributed::FLAGS_pserver_table_save_max_retry) { LOG(ERROR) << "MemorySparseTable load failed reach max limit!"; exit(-1); } @@ -149,7 +150,7 @@ int32_t MemorySparseTable::load(const std::string& path, } LOG(INFO) << "MemorySparseTable load success, path from " << file_list[file_start_idx] << " to " - << file_list[file_start_idx + real_local_shard_num_ - 1]; + << file_list[file_start_idx + _real_local_shard_num - 1]; return 0; } @@ -159,7 +160,7 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, auto file_list = paddle::framework::localfs_list(table_path); int load_param = atoi(param.c_str()); - auto expect_shard_num = sparse_table_shard_num_; + auto expect_shard_num = _sparse_table_shard_num; if (file_list.size() != expect_shard_num) { LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size() << " not equal to expect_shard_num:" << expect_shard_num; @@ -170,14 +171,14 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, return -1; } - size_t file_start_idx = _shard_idx * avg_local_shard_num_; + size_t file_start_idx = _shard_idx * _avg_local_shard_num; size_t feature_value_size = _value_accesor->size() / sizeof(float); - // int thread_num = shard_values_.size() < 15 ? shard_values_.size() : 15; - // omp_set_num_threads(thread_num); - // #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < real_local_shard_num_; ++i) { + int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < _real_local_shard_num; ++i) { bool is_read_failed = false; int retry_num = 0; int err_no = 0; @@ -187,16 +188,15 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, std::string line_data; std::ifstream file(file_list[file_start_idx + i]); char* end = NULL; - auto& shard = shard_values_[i]; + auto& shard = _local_shards[i]; try { while (std::getline(file, line_data) && line_data.size() > 1) { uint64_t key = std::strtoul(line_data.data(), &end, 10); - auto* value = shard->Init(key); - value->resize(feature_value_size); + auto& value = shard[key]; + value.resize(feature_value_size); int parse_size = - _value_accesor->parse_from_string(++end, value->data()); - value->resize(parse_size); - // value->shrink_to_fit(); + _value_accesor->parse_from_string(++end, value.data()); + value.resize(parse_size); } file.close(); if (err_no == -1) { @@ -213,7 +213,7 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, << file_list[file_start_idx + i] << " , retry_num=" << retry_num; } - if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) { + if (retry_num > paddle::distributed::FLAGS_pserver_table_save_max_retry) { LOG(ERROR) << "MemorySparseTable load failed reach max limit!"; exit(-1); } @@ -221,7 +221,7 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, } LOG(INFO) << "MemorySparseTable load success, path from " << file_list[file_start_idx] << " to " - << file_list[file_start_idx + real_local_shard_num_ - 1]; + << file_list[file_start_idx + _real_local_shard_num - 1]; return 0; } @@ -233,15 +233,14 @@ int32_t MemorySparseTable::save(const std::string& dirname, std::string table_path = table_dir(dirname); _afs_client.remove(paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); - // int thread_num = shard_values_.size() < 20 ? shard_values_.size() : 20; std::atomic feasign_size_all{0}; - size_t file_start_idx = avg_local_shard_num_ * _shard_idx; + size_t file_start_idx = _avg_local_shard_num * _shard_idx; - // TODO(zhaocaibei123): openmp - // omp_set_num_threads(thread_num); - // #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < real_local_shard_num_; ++i) { + int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20; + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < _real_local_shard_num; ++i) { FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string( @@ -259,30 +258,28 @@ int32_t MemorySparseTable::save(const std::string& dirname, int feasign_size = 0; int retry_num = 0; int err_no = 0; - auto& shard = shard_values_[i]; + auto& shard = _local_shards[i]; do { err_no = 0; feasign_size = 0; is_write_failed = false; auto write_channel = _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); - for (auto& table : shard->values_) { - for (auto& value : table) { - if (_value_accesor->save(value.second->data(), save_param)) { - std::string format_value = _value_accesor->parse_to_string( - value.second->data(), value.second->size()); - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", value.first, format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) - << "MemorySparseTable save prefix failed, retry it! 
path:" - << channel_config.path << " , retry_num=" << retry_num; - break; - } - ++feasign_size; + for (auto it = shard.begin(); it != shard.end(); ++it) { + if (_value_accesor->save(it.value().data(), save_param)) { + std::string format_value = _value_accesor->parse_to_string( + it.value().data(), it.value().size()); + if (0 != + write_channel->write_line(paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { + ++retry_num; + is_write_failed = true; + LOG(ERROR) + << "MemorySparseTable save prefix failed, retry it! path:" + << channel_config.path << " , retry_num=" << retry_num; + break; } + ++feasign_size; } } write_channel->close(); @@ -296,17 +293,14 @@ int32_t MemorySparseTable::save(const std::string& dirname, if (is_write_failed) { _afs_client.remove(channel_config.path); } - if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) { + if (retry_num > paddle::distributed::FLAGS_pserver_table_save_max_retry) { LOG(ERROR) << "MemorySparseTable save prefix failed reach max limit!"; exit(-1); } } while (is_write_failed); feasign_size_all += feasign_size; - for (auto& table : shard->values_) { - for (auto& value : table) { - _value_accesor->update_stat_after_save(value.second->data(), - save_param); - } + for (auto it = shard.begin(); it != shard.end(); ++it) { + _value_accesor->update_stat_after_save(it.value().data(), save_param); } LOG(INFO) << "MemorySparseTable save prefix success, path: " << channel_config.path; @@ -322,26 +316,30 @@ int32_t MemorySparseTable::save_local_fs(const std::string& dirname, atoi(param.c_str()); // checkpoint:0 xbox delta:1 xbox base:2 std::string table_path = table_dir(dirname); int feasign_cnt = 0; - size_t file_start_idx = avg_local_shard_num_ * _shard_idx; - for (size_t i = 0; i < real_local_shard_num_; ++i) { + size_t file_start_idx = _avg_local_shard_num * _shard_idx; + + int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; + std::atomic feasign_size_all{0}; + + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < _real_local_shard_num; ++i) { feasign_cnt = 0; - auto& shard = shard_values_[i]; + auto& shard = _local_shards[i]; std::string file_name = paddle::string::format_string( "%s/part-%s-%03d-%05d", table_path.c_str(), prefix.c_str(), _shard_idx, file_start_idx + i); std::ofstream os; os.open(file_name); - for (auto& table : shard->values_) { - for (auto& value : table) { - if (_value_accesor->save(value.second->data(), save_param)) { - std::string format_value = _value_accesor->parse_to_string( - value.second->data(), value.second->size()); - std::string out_line = paddle::string::format_string( - "%lu %s\n", value.first, format_value.c_str()); - // VLOG(2) << out_line.c_str(); - os.write(out_line.c_str(), sizeof(char) * out_line.size()); - ++feasign_cnt; - } + for (auto it = shard.begin(); it != shard.end(); ++it) { + if (_value_accesor->save(it.value().data(), save_param)) { + std::string format_value = _value_accesor->parse_to_string( + it.value().data(), it.value().size()); + std::string out_line = paddle::string::format_string( + "%lu %s\n", it.key(), format_value.c_str()); + // VLOG(2) << out_line.c_str(); + os.write(out_line.c_str(), sizeof(char) * out_line.size()); + ++feasign_cnt; } } os.close(); @@ -351,22 +349,51 @@ int32_t MemorySparseTable::save_local_fs(const std::string& dirname, return 0; } -std::pair MemorySparseTable::print_table_stat() { - int64_t feasign_size = 0; - int64_t mf_size = 0; +int64_t MemorySparseTable::local_size() { + int64_t local_size = 0; + for (size_t i = 0; i < _real_local_shard_num; ++i) { + local_size += _local_shards[i].size(); + } + return local_size; +} - for (auto& shard : shard_values_) { - for (auto& table : shard->values_) { - feasign_size += table.size(); - } +int64_t MemorySparseTable::local_mf_size() { + std::vector size_arr(_real_local_shard_num, 0); + std::vector> tasks(_real_local_shard_num); + int64_t ret_size = 0; + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, shard_id, &size_arr]() -> int { + auto& local_shard = _local_shards[shard_id]; + for (auto it = local_shard.begin(); it != local_shard.end(); + ++it) { + if (_value_accesor->has_mf(it.value().size())) { + size_arr[shard_id] += 1; + } + } + return 0; + }); + } + for (size_t i = 0; i < _real_local_shard_num; ++i) { + tasks[i].wait(); + } + for (auto x : size_arr) { + ret_size += x; } + return ret_size; +} +std::pair MemorySparseTable::print_table_stat() { + int64_t feasign_size = local_size(); + int64_t mf_size = local_mf_size(); return {feasign_size, mf_size}; } int32_t MemorySparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { - std::vector> tasks(real_local_shard_num_); + CostTimer timer("pserver_sparse_select_all"); + std::vector> tasks(_real_local_shard_num); const size_t value_size = _value_accesor->size() / sizeof(float); size_t mf_value_size = _value_accesor->mf_size() / sizeof(float); @@ -374,42 +401,42 @@ int32_t MemorySparseTable::pull_sparse(float* pull_values, // std::atomic missed_keys{0}; std::vector>> task_keys( - real_local_shard_num_); + _real_local_shard_num); size_t num = pull_value.numel_; for (size_t i = 0; i < num; ++i) { - int shard_id = (pull_value.feasigns_[i] % sparse_table_shard_num_) % - avg_local_shard_num_; + int shard_id = 
(pull_value.feasigns_[i] % _sparse_table_shard_num) % + _avg_local_shard_num; task_keys[shard_id].push_back({pull_value.feasigns_[i], i}); } - for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = - shards_task_pool_[shard_id % shards_task_pool_.size()]->enqueue( + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( [this, shard_id, &task_keys, value_size, pull_values, mf_value_size, select_value_size]() -> int { - auto& local_shard = shard_values_[shard_id]; + auto& local_shard = _local_shards[shard_id]; float data_buffer[value_size]; // NOLINT float* data_buffer_ptr = data_buffer; auto& keys = task_keys[shard_id]; for (size_t i = 0; i < keys.size(); i++) { uint64_t key = keys[i].first; - auto itr = local_shard->Find(key); + auto itr = local_shard.find(key); size_t data_size = value_size - mf_value_size; - if (itr == local_shard->end()) { + if (itr == local_shard.end()) { // ++missed_keys; - if (FLAGS_pslib_create_value_when_push) { + if (FLAGS_pserver_create_value_when_push) { memset(data_buffer, 0, sizeof(float) * data_size); } else { - auto* feature_value = local_shard->Init(key); - feature_value->resize(data_size); - float* data_ptr = feature_value->data(); + auto& feature_value = local_shard[key]; + feature_value.resize(data_size); + float* data_ptr = feature_value.data(); _value_accesor->create(&data_buffer_ptr, 1); memcpy(data_ptr, data_buffer_ptr, data_size * sizeof(float)); } } else { - data_size = itr->second->size(); - memcpy(data_buffer_ptr, itr->second->data(), + data_size = itr.value().size(); + memcpy(data_buffer_ptr, itr.value().data(), data_size * sizeof(float)); } for (int mf_idx = data_size; mf_idx < value_size; ++mf_idx) { @@ -439,11 +466,12 @@ int32_t MemorySparseTable::pull_sparse_ptr(char** pull_values, int32_t MemorySparseTable::push_sparse(const uint64_t* keys, const float* values, size_t num) { - std::vector> tasks(real_local_shard_num_); + CostTimer timer("pserver_sparse_update_all"); + std::vector> tasks(_real_local_shard_num); std::vector>> task_keys( - real_local_shard_num_); + _real_local_shard_num); for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % sparse_table_shard_num_) % avg_local_shard_num_; + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; task_keys[shard_id].push_back({keys[i], i}); } @@ -451,41 +479,38 @@ int32_t MemorySparseTable::push_sparse(const uint64_t* keys, size_t mf_value_col = _value_accesor->mf_size() / sizeof(float); size_t update_value_col = _value_accesor->update_size() / sizeof(float); - for (size_t shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) { - tasks[shard_id] = shards_task_pool_[shard_id % task_pool_size_]->enqueue( + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id % _task_pool_size]->enqueue( [this, shard_id, value_col, mf_value_col, update_value_col, values, &task_keys]() -> int { auto& keys = task_keys[shard_id]; - auto& local_shard = shard_values_[shard_id]; + auto& local_shard = _local_shards[shard_id]; float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = values + push_data_idx * update_value_col; - auto itr = local_shard->Find(key); - if (itr == local_shard->end()) { + auto itr = local_shard.find(key); + if (itr == 
local_shard.end()) { VLOG(0) << "sparse table push_sparse: " << key << "not found!"; - if (FLAGS_pslib_enable_create_feasign_randomly && + if (FLAGS_pserver_enable_create_feasign_randomly && !_value_accesor->create_value(1, update_data)) { continue; } auto value_size = value_col - mf_value_col; - auto* feature_value = local_shard->Init(key); - feature_value->resize(value_size); + auto& feature_value = local_shard[key]; + feature_value.resize(value_size); _value_accesor->create(&data_buffer_ptr, 1); - memcpy(feature_value->data(), data_buffer_ptr, + memcpy(feature_value.data(), data_buffer_ptr, value_size * sizeof(float)); - itr = local_shard->Find(key); - } else { - VLOG(2) << "sparse table debug push_sparse: " << key << " found!"; + itr = local_shard.find(key); } - auto* feature_value = itr->second; - float* value_data = feature_value->data(); - size_t value_size = feature_value->size(); + auto& feature_value = itr.value(); + float* value_data = feature_value.data(); + size_t value_size = feature_value.size(); if (value_size == value_col) { // 已拓展到最大size, 则就地update _value_accesor->update(&value_data, &update_data, 1); @@ -495,8 +520,8 @@ int32_t MemorySparseTable::push_sparse(const uint64_t* keys, _value_accesor->update(&data_buffer_ptr, &update_data, 1); if (_value_accesor->need_extend_mf(data_buffer)) { - feature_value->resize(value_col); - value_data = feature_value->data(); + feature_value.resize(value_col); + value_data = feature_value.data(); _value_accesor->create(&value_data, 1); } memcpy(value_data, data_buffer_ptr, value_size * sizeof(float)); @@ -520,11 +545,11 @@ int32_t MemorySparseTable::push_sparse(const uint64_t* keys, int32_t MemorySparseTable::_push_sparse(const uint64_t* keys, const float** values, size_t num) { - std::vector> tasks(real_local_shard_num_); + std::vector> tasks(_real_local_shard_num); std::vector>> task_keys( - real_local_shard_num_); + _real_local_shard_num); for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % sparse_table_shard_num_) % avg_local_shard_num_; + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; task_keys[shard_id].push_back({keys[i], i}); } @@ -532,36 +557,35 @@ int32_t MemorySparseTable::_push_sparse(const uint64_t* keys, size_t mf_value_col = _value_accesor->mf_size() / sizeof(float); size_t update_value_col = _value_accesor->update_size() / sizeof(float); - for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) { - tasks[shard_id] = shards_task_pool_[shard_id % task_pool_size_]->enqueue( + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id % _task_pool_size]->enqueue( [this, shard_id, value_col, mf_value_col, update_value_col, values, &task_keys]() -> int { auto& keys = task_keys[shard_id]; - auto& local_shard = shard_values_[shard_id]; + auto& local_shard = _local_shards[shard_id]; float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = values[push_data_idx]; - auto itr = local_shard->Find(key); - if (itr == local_shard->end()) { - if (FLAGS_pslib_enable_create_feasign_randomly && + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + if (FLAGS_pserver_enable_create_feasign_randomly && !_value_accesor->create_value(1, update_data)) { continue; } auto value_size = value_col - mf_value_col; - auto* feature_value = local_shard->Init(key); - 
feature_value->resize(value_size); + auto& feature_value = local_shard[key]; + feature_value.resize(value_size); _value_accesor->create(&data_buffer_ptr, 1); - memcpy(feature_value->data(), data_buffer_ptr, + memcpy(feature_value.data(), data_buffer_ptr, value_size * sizeof(float)); - itr = local_shard->Find(key); + itr = local_shard.find(key); } - auto* feature_value = itr->second; - float* value_data = feature_value->data(); - size_t value_size = feature_value->size(); + auto& feature_value = itr.value(); + float* value_data = feature_value.data(); + size_t value_size = feature_value.size(); if (value_size == value_col) { // 已拓展到最大size, 则就地update _value_accesor->update(&value_data, &update_data, 1); } else { @@ -569,8 +593,8 @@ int32_t MemorySparseTable::_push_sparse(const uint64_t* keys, memcpy(data_buffer_ptr, value_data, value_size * sizeof(float)); _value_accesor->update(&data_buffer_ptr, &update_data, 1); if (_value_accesor->need_extend_mf(data_buffer)) { - feature_value->resize(value_col); - value_data = feature_value->data(); + feature_value.resize(value_col); + value_data = feature_value.data(); _value_accesor->create(&value_data, 1); } memcpy(value_data, data_buffer_ptr, value_size * sizeof(float)); @@ -591,18 +615,14 @@ int32_t MemorySparseTable::flush() { return 0; } int32_t MemorySparseTable::shrink(const std::string& param) { VLOG(0) << "MemorySparseTable::shrink"; // TODO(zhaocaibei123): implement with multi-thread - for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { // shrink - auto& shard = shard_values_[shard_id]; - for (auto& table : shard->values_) { - for (auto iter = table.begin(); iter != table.end();) { - if (_value_accesor->shrink(iter->second->data())) { - butil::return_object(iter->second); - iter = table.erase(iter); - VLOG(1) << "shrink erase key: " << iter->first; - } else { - ++iter; - } + auto& shard = _local_shards[shard_id]; + for (auto it = shard.begin(); it != shard.end();) { + if (_value_accesor->shrink(it.value().data())) { + it = shard.erase(it); + } else { + ++it; } } } diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/table/memory_sparse_table.h index 409757ebec22a..cb552beab1371 100644 --- a/paddle/fluid/distributed/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/table/memory_sparse_table.h @@ -36,6 +36,7 @@ namespace distributed { class MemorySparseTable : public SparseTable { public: + typedef SparseTableShard shard_type; MemorySparseTable() {} virtual ~MemorySparseTable() {} @@ -59,6 +60,9 @@ class MemorySparseTable : public SparseTable { int32_t save_local_fs(const std::string& path, const std::string& param, const std::string& prefix); + int64_t local_size(); + int64_t local_mf_size(); + virtual std::pair print_table_stat(); virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); @@ -80,12 +84,12 @@ class MemorySparseTable : public SparseTable { size_t num); protected: - const int task_pool_size_ = 24; - size_t avg_local_shard_num_; - size_t real_local_shard_num_; - size_t sparse_table_shard_num_; - std::vector> shards_task_pool_; - std::vector> shard_values_; + const int _task_pool_size = 24; + size_t _avg_local_shard_num; + size_t _real_local_shard_num; + size_t _sparse_table_shard_num; + std::vector> _shards_task_pool; + std::unique_ptr _local_shards; }; } // namespace distributed diff --git a/paddle/fluid/distributed/test/dense_table_test.cc 
b/paddle/fluid/distributed/test/dense_table_test.cc index f2f1e098faae2..2e48b791dc8db 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -27,9 +27,6 @@ class Table; TEST(CommonDenseTable, Adam) { int fea_dim = 10; int trainers = 2; - float beta1 = 0.9; - float beta2 = 0.999; - float epsilon = 1.0e-8; TableParameter table_config; table_config.set_table_class("CommonDenseTable"); @@ -39,27 +36,33 @@ TEST(CommonDenseTable, Adam) { accessor_config->set_accessor_class("CommMergeAccessor"); CommonAccessorParameter *common_config = table_config.mutable_common(); // set adam optimize config - common_config->set_name("adam"); + common_config->set_name("adam_d2sum"); common_config->set_table_name("adam_test_table"); common_config->set_trainer_num(trainers); common_config->add_params("Param"); common_config->add_dims(fea_dim); common_config->add_initializers("gaussian_random&0&0.0&1.0"); - common_config->add_params("LearningRate"); - common_config->add_dims(1); - common_config->add_initializers("fill_constant&1.0"); - common_config->add_params("Moment1"); + common_config->add_params("D2Sum"); + common_config->add_dims(fea_dim); + common_config->add_initializers("fill_constant&0.0"); + common_config->add_params("G2Sum"); common_config->add_dims(fea_dim); common_config->add_initializers("fill_constant&0.0"); - common_config->add_params("Moment2"); + common_config->add_params("Moment"); common_config->add_dims(fea_dim); common_config->add_initializers("fill_constant&0.0"); - common_config->add_params("Beta1Pow"); + common_config->add_params("MomentDecayRate"); common_config->add_dims(1); - common_config->add_initializers("fill_constant&1.0"); - common_config->add_params("Beta2Pow"); + common_config->add_initializers("fill_constant&0.99"); + common_config->add_params("AdaDecayRate"); common_config->add_dims(1); - common_config->add_initializers("fill_constant&1.0"); + common_config->add_initializers("fill_constant&0.9999"); + common_config->add_params("AdaEpsilon"); + common_config->add_dims(1); + common_config->add_initializers("fill_constant&1.0e-8"); + common_config->add_params("LearningRate"); + common_config->add_dims(1); + common_config->add_initializers("fill_constant&5e-6"); auto ret = table->initialize(table_config, fs_config); ASSERT_EQ(ret, 0); @@ -89,29 +92,30 @@ TEST(CommonDenseTable, Adam) { pull_values.resize(fea_dim); table->pull_dense(pull_values.data(), fea_dim); - std::vector beta1_pow, beta2_pow, lr, mom1, mom2, param; - beta1_pow.push_back(beta1); - beta2_pow.push_back(beta2); - lr.push_back(1.0); + float mom_rate = 0.99; + float decay_rate = 0.9999; + float epsilon = 1.0e-8; + float lr = 5e-6; + std::vector d2sum, g2sum, mom, param; for (int i = 0; i < fea_dim; i++) { - mom1.push_back(0.0); - mom2.push_back(0.0); + mom.push_back(0.0); + d2sum.push_back(0.0); + g2sum.push_back(0.0); param.push_back(init_values[i]); } for (int i = 0; i < trainers; i++) { - auto lr_ = lr[0] * sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); for (int j = 0; j < fea_dim; j++) { - mom1[j] = beta1 * mom1[j] + (1 - beta1) * trainer_gradient_values[i][j]; - mom2[j] = beta2 * mom2[j] + - (1 - beta2) * trainer_gradient_values[i][j] * - trainer_gradient_values[i][j]; - param[j] = - param[j] - - lr_ * (mom1[j] / (sqrt(mom2[j]) + epsilon * sqrt(1 - beta2_pow[0]))); + d2sum[j] = d2sum[j] * decay_rate + 1; + g2sum[j] = g2sum[j] * decay_rate + + trainer_gradient_values[i][j] * trainer_gradient_values[i][j]; + float scale = d2sum[j] * epsilon; + 
scale = (scale + d2sum[j]) / (scale + g2sum[j]); + scale = sqrt(scale); + mom[j] = (mom[j] - trainer_gradient_values[i][j]) * mom_rate + + trainer_gradient_values[i][j]; + param[j] = param[j] - lr * scale * mom[j]; } - beta1_pow[0] *= beta1; - beta2_pow[0] *= beta2; } for (int j = 0; j < fea_dim; j++) { ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-5); diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc index 9c9f0ffcac321..9bd00dcc56fc2 100644 --- a/paddle/fluid/distributed/test/feature_value_test.cc +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -12,38 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - -#include -#include -#include // NOLINT +#include "paddle/fluid/distributed/table/depends/feature_value.h" #include - -#include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/table/depends/feature_value.h" namespace paddle { namespace distributed { TEST(BENCHMARK, LargeScaleKV) { - std::shared_ptr shard = - std::make_shared(); + typedef SparseTableShard shard_type; + shard_type shard; uint64_t key = 1; - auto itr = shard->Find(key); - ASSERT_TRUE(itr == shard->end()); + auto itr = shard.find(key); + ASSERT_TRUE(itr == shard.end()); std::vector vec = {0.0, 0.1, 0.2, 0.3}; - auto* feature_value = shard->Init(key); - feature_value->resize(vec.size()); - memcpy(feature_value->data(), vec.data(), vec.size() * sizeof(float)); + auto& feature_value = shard[key]; + feature_value.resize(vec.size()); + memcpy(feature_value.data(), vec.data(), vec.size() * sizeof(float)); - itr = shard->Find(key); - ASSERT_TRUE(itr != shard->end()); + itr = shard.find(key); + ASSERT_TRUE(itr != shard.end()); - feature_value = itr->second; - float* value_data = feature_value->data(); + feature_value = itr.value(); + float* value_data = feature_value.data(); ASSERT_FLOAT_EQ(value_data[0], 0.0); ASSERT_FLOAT_EQ(value_data[1], 0.1); From 0efcae8676869d923eb3beca5259549e8b0776a0 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 21:09:38 +0800 Subject: [PATCH 117/151] [part 3]change type of function args (#38887) * code clean * [part 3]change type of function args --- .../fluid/operators/controlflow/bitwise_op.h | 30 ++++++------- .../operators/controlflow/compare_all_op.h | 2 +- .../fluid/operators/controlflow/compare_op.h | 12 +++--- .../fluid/operators/controlflow/logical_op.cu | 28 ++----------- .../fluid/operators/controlflow/logical_op.h | 42 ++++++++----------- 5 files changed, 44 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h index 92abe4cd3b1c3..9e652f9200747 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ b/paddle/fluid/operators/controlflow/bitwise_op.h @@ -22,19 +22,19 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T& a, const T& b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool& a, const bool& b) const { \ - return a bool_expr b; \ - } \ +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = T; \ + HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ + }; \ + \ + template <> \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = bool; \ + HOSTDEVICE bool operator()(const bool a, const bool b) const { \ + return a bool_expr b; \ + } \ }; BITWISE_BINARY_FUNCTOR(And, &, &&) @@ -45,13 +45,13 @@ BITWISE_BINARY_FUNCTOR(Xor, ^, !=) template struct BitwiseNotFunctor { using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T& a) const { return ~a; } + HOSTDEVICE T operator()(const T a) const { return ~a; } }; template <> struct BitwiseNotFunctor { using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool& a) const { return !a; } + HOSTDEVICE bool operator()(const bool a) const { return !a; } }; template diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h index bcad240601cf6..78a7b76e3fd9d 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ b/paddle/fluid/operators/controlflow/compare_all_op.h @@ -28,7 +28,7 @@ namespace operators { template struct EqualReduceFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index 36185322a96b8..d2ef4c9befba9 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -25,31 +25,31 @@ namespace operators { template struct LessThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; } }; template struct LessEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; } }; template struct GreaterThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; } }; template struct GreaterEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; } }; template struct EqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. 
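An aside on the pattern this patch changes: the comparison and bitwise functors now take their scalar operands by value (const T) rather than by const reference, presumably because for small trivially copyable element types passing by value is at least as cheap and avoids handing references into device code. Below is a host-only sketch of the same functor style with hypothetical names; HOSTDEVICE and the kernel plumbing are deliberately left out.

#include <cmath>
#include <iostream>
#include <type_traits>

// Element-wise comparison functors taking scalars by value, echoing the style
// above. These are illustrative stand-ins, not the Paddle functors.
template <typename T>
struct LessThanFunctorSketch {
  using ELEM_TYPE = T;
  bool operator()(const T a, const T b) const { return a < b; }
};

template <typename T>
struct EqualFunctorSketch {
  using ELEM_TYPE = T;
  bool operator()(const T a, const T b) const {
    if (std::is_floating_point<T>::value) {
      // Compare floating point with a tolerance after widening to double,
      // echoing what the original EqualFunctor does.
      return std::fabs(static_cast<double>(a) - static_cast<double>(b)) < 1e-8;
    }
    return a == b;
  }
};

// Apply a binary functor element-wise, roughly how an element-wise compare
// kernel would consume it.
template <typename T, typename Functor>
void Apply(const T* x, const T* y, bool* out, int n, Functor func) {
  for (int i = 0; i < n; ++i) out[i] = func(x[i], y[i]);
}

int main() {
  float x[3] = {1.0f, 2.0f, 3.0f};
  float y[3] = {1.0f, 2.5f, 2.0f};
  bool eq[3], lt[3];
  Apply(x, y, eq, 3, EqualFunctorSketch<float>{});
  Apply(x, y, lt, 3, LessThanFunctorSketch<float>{});
  for (int i = 0; i < 3; ++i) std::cout << eq[i] << " " << lt[i] << "\n";
  return 0;
}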
@@ -63,7 +63,7 @@ struct EqualFunctor { template struct NotEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { return !EqualFunctor()(a, b); } }; diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 301b4c4149fad..4a3fc6c895174 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -18,26 +18,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T* args) const { \ - return static_cast(args[0]) op static_cast(args[1]); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct CudaNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } -}; - template class BinaryLogicalOpKernel : public framework::OpKernel { @@ -76,8 +56,8 @@ class BinaryLogicalOpKernel ops::BinaryLogicalOpKernel>, \ ops::BinaryLogicalOpKernel>); -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) #undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h index 92fe0a10cb907..ee63da60fcd0f 100644 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ b/paddle/fluid/operators/controlflow/logical_op.h @@ -19,38 +19,32 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -struct LogicalAndFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } -}; +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; -template -struct LogicalOrFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } -}; +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR template struct LogicalNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a) const { return !a; } -}; - -template -struct LogicalXorFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } }; template class BinaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* out = context.Output("Out"); @@ -62,10 +56,10 @@ class BinaryLogicalOpKernel template class UnaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; From 277cf900fb49a28e7d7818addbb863f2b62d3ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:23:12 +0800 Subject: [PATCH 118/151] splits allocation for pten, test=develop (#38853) --- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/tensor.cc | 8 -- paddle/fluid/framework/tensor.h | 8 -- paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 9 +- .../inference/api/details/zero_copy_tensor.cc | 7 +- paddle/fluid/inference/lite/tensor_utils.cc | 5 +- .../memory/allocation/aligned_allocator.cc | 14 ++- .../memory/allocation/aligned_allocator.h | 4 +- paddle/fluid/memory/allocation/allocator.cc | 9 +- paddle/fluid/memory/allocation/allocator.h | 105 ++++++----------- .../memory/allocation/allocator_facade.cc | 34 +++--- .../memory/allocation/allocator_facade.h | 1 + .../auto_growth_best_fit_allocator.cc | 11 +- .../auto_growth_best_fit_allocator.h | 8 +- .../auto_growth_best_fit_allocator_test.cc | 8 +- .../fluid/memory/allocation/base_ptr_test.cu | 8 +- .../memory/allocation/best_fit_allocator.cc | 6 +- .../memory/allocation/best_fit_allocator.h | 8 +- .../memory/allocation/buffered_allocator.cc | 7 +- .../memory/allocation/buffered_allocator.h | 4 +- .../allocation/buffered_allocator_test.cc | 6 +- .../fluid/memory/allocation/cpu_allocator.cc | 4 +- .../fluid/memory/allocation/cpu_allocator.h | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.h | 4 +- .../cuda_device_context_allocator.h | 14 +-- .../allocation/cuda_virtual_mem_allocator.cc | 4 +- 
.../allocation/cuda_virtual_mem_allocator.h | 4 +- .../memory/allocation/locked_allocator.cc | 4 +- .../memory/allocation/locked_allocator.h | 4 +- .../allocation/naive_best_fit_allocator.cc | 4 +- .../allocation/naive_best_fit_allocator.h | 4 +- .../fluid/memory/allocation/npu_allocator.cc | 4 +- .../fluid/memory/allocation/npu_allocator.h | 4 +- .../memory/allocation/npu_pinned_allocator.cc | 8 +- .../memory/allocation/npu_pinned_allocator.h | 8 +- .../memory/allocation/pinned_allocator.cc | 4 +- .../memory/allocation/pinned_allocator.h | 4 +- .../memory/allocation/retry_allocator.cc | 4 +- .../fluid/memory/allocation/retry_allocator.h | 4 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 9 +- .../allocation/stream_safe_cuda_allocator.h | 8 +- .../allocation/test_aligned_allocator.cc | 4 +- .../allocation/thread_local_allocator.h | 4 +- ...l_memory_auto_growth_best_fit_allocator.cc | 8 +- ...al_memory_auto_growth_best_fit_allocator.h | 6 +- paddle/fluid/memory/malloc.h | 2 +- .../fluid/operators/math/concat_and_split.cu | 10 +- .../device/mlu/device_context_allocator.h | 6 +- .../fluid/platform/device/npu/npu_op_runner.h | 3 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- paddle/pten/api/lib/utils/allocator.cc | 23 ---- paddle/pten/api/lib/utils/allocator.h | 8 +- paddle/pten/api/lib/utils/storage.cc | 5 +- paddle/pten/api/lib/utils/tensor_utils.cc | 2 +- paddle/pten/core/allocator.h | 3 + paddle/pten/core/candidate/allocator.h | 107 ++++++++++++++++++ paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/storage.h | 1 + paddle/pten/tests/core/allocator.h | 7 +- paddle/pten/tests/core/test_allocator.cc | 4 + tools/check_file_diff_approvals.sh | 19 +--- 65 files changed, 328 insertions(+), 292 deletions(-) delete mode 100644 paddle/pten/api/lib/utils/allocator.cc create mode 100644 paddle/pten/core/candidate/allocator.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a46c83a2b3ad..09e4abc77f573 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -410,8 +410,8 @@ class ExecutionContext { auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = std::shared_ptr( - allocation_ptr, deleter); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); PADDLE_ENFORCE_GE( allocation_ptr->size(), framework::product(dim) * sizeof(T), diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index f11b37825d4f0..6aa10a058081b 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -17,14 +17,6 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e86009e9aafea..fcdb837bc80ce 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -32,14 +32,6 @@ limitations under the License. 
*/ #include "paddle/pten/core/dense_tensor.h" -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7fd125834a0c3..5fd581220097b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 46eba6a1e41bb..11858e4166595 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -183,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -241,8 +240,7 @@ void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -312,8 +310,7 @@ inline void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 01d4dbccd50ea..2f2f4c0ead760 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto t_place = tensor->place(); paddle::framework::Tensor out; - auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), - paddle::platform::CPUPlace()); + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); if (paddle::platform::is_cpu_place(t_place)) { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index b1e0eb5ef16ab..0d5cd29a0c579 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { size_t memory_size = GetLiteTensorNumel(*src) * 
framework::SizeOfType(GetNativePrecisionType(src->precision())); - std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, memory_size, - GetNativePlace(src->target()))); + std::shared_ptr holder(new pten::Allocation( + src_raw_data, memory_size, GetNativePlace(src->target()))); dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision())); diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 10380c0d6028d..258cff32b4fca 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -23,7 +23,7 @@ namespace allocation { // For memory address alignment class AlignedAllocation : public Allocation { public: - AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast(underlying_allocation->ptr()) + offset, underlying_allocation->base_ptr(), @@ -32,7 +32,7 @@ class AlignedAllocation : public Allocation { underlying_allocation_(std::move(underlying_allocation)) {} private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; AlignedAllocator::AlignedAllocator( @@ -52,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const { return underlying_allocator_->IsAllocThreadSafe(); } -Allocation* AlignedAllocator::AllocateImpl(size_t size) { +pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) { auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); - return new AlignedAllocation(std::move(raw_allocation), offset); + auto* p = new AlignedAllocation( + static_unique_ptr_cast(std::move(raw_allocation)), offset); + return p; } -void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } +void AlignedAllocator::FreeImpl(pten::Allocation* allocation) { + delete allocation; +} } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 6fef5cae8d6af..ffd5ad0fae1b0 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + pten::Allocation* AllocateImpl(size_t size) override; - void FreeImpl(Allocation* allocation) override; + void FreeImpl(pten::Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb9613..0ef6f5cbab5cc 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -18,11 +18,10 @@ namespace paddle { namespace memory { namespace allocation { -bool Allocator::IsAllocThreadSafe() const { return false; } - -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); +void Allocator::FreeImpl(pten::Allocation* allocation) { + static_cast(allocation) + ->TopDecoratedAllocator() + ->Free(allocation); } } // namespace allocation diff --git 
a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index ee802462ddc94..3f04d47516377 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/allocator.h" DECLARE_string(allocator_strategy); @@ -80,30 +81,19 @@ class Allocator; * e.g., something what is done in AlignedAllocator, etc. * In this case, we should declare a derived class of Allocation, which * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would + * Therefore, `decorated_allocators_` of the new Allocation object + * would * be a new chain, differing from the underlying Allocation object. */ -class Allocation { +class Allocation : public pten::Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {} - inline Allocation(void* ptr, void* base_ptr, size_t size, - platform::Place place) - : ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {} - - Allocation(const Allocation& o) = delete; - Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; - - // Returns the holding pointer. - // NOTE: For performance consideration, it is better not to make this method - // as a virtual method. If we want to implement a `defragmentation` later, - // we might need to make `ptr_` field as a protected field, and add a virtual - // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } - - inline void* base_ptr() const { + Allocation(void* ptr, size_t size, platform::Place place) + : pten::Allocation(ptr, size, place), base_ptr_(ptr) {} + Allocation(void* ptr, void* base_ptr, size_t size, + const platform::Place& place) + : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} + + void* base_ptr() const { PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", paddle::platform::errors::Unimplemented( "base_ptr() is only implemented for auto_growth " @@ -112,21 +102,6 @@ class Allocation { return base_ptr_; } - // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the - // last valid element. - // - // NOTE: Some allocator might alloc more memory than request. The size - // could larger than its request. For example, - // the AlignedAllocator will always allocate memory as size + kAlignment. - // The raw pointer might not aligned, so an offset might be added to raw - // the pointer. The size of this allocation will be - // `size + kAlignemnt - offset`. - inline size_t size() const { return size_; } - - inline const platform::Place& place() const { return place_; } - - virtual ~Allocation() {} - private: inline void RegisterDecoratedAllocator(Allocator* allocator) { decorated_allocators_.emplace_back(allocator); @@ -139,10 +114,7 @@ class Allocation { } private: - void* ptr_; void* base_ptr_; // the point that directly requested from system - size_t size_; - platform::Place place_; /** * NOTE(zjl): Since decorated_allocators_ is usually a small vector. 
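An aside on the refactor above before the remaining allocator.h hunks: the reworked Allocation keeps a small stack of the allocators that decorated it, Allocator::Allocate pushes the current allocator onto that stack, and the smart-pointer deleter frees through whatever allocator sits on top, so layered allocators unwind in reverse order. The following is a standalone sketch of that bookkeeping with simplified, made-up class names, not the Paddle API.

#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

// Base allocation record (pointer + size), a stand-in for the pten-side base.
class BaseAllocation {
 public:
  BaseAllocation(void* ptr, size_t size) : ptr_(ptr), size_(size) {}
  virtual ~BaseAllocation() = default;
  void* ptr() const { return ptr_; }
  size_t size() const { return size_; }
 private:
  void* ptr_;
  size_t size_;
};

class ToyAllocator;

// Derived allocation that records the chain of allocators that produced it,
// like RegisterDecoratedAllocator / TopDecoratedAllocator in the patch.
class ToyAllocation : public BaseAllocation {
 public:
  using BaseAllocation::BaseAllocation;
  void Register(ToyAllocator* a) { decorated_.push_back(a); }
  ToyAllocator* Top() const { return decorated_.back(); }
  void Pop() { decorated_.pop_back(); }
 private:
  std::vector<ToyAllocator*> decorated_;
};

class ToyAllocator {
 public:
  virtual ~ToyAllocator() = default;

  // The smart-pointer deleter dispatches to whichever allocator is on top.
  static void Deleter(BaseAllocation* allocation) {
    auto* a = static_cast<ToyAllocation*>(allocation);
    a->Top()->Free(a);
  }
  using AllocationPtr = std::unique_ptr<BaseAllocation, void (*)(BaseAllocation*)>;

  AllocationPtr Allocate(size_t size) {
    ToyAllocation* a = AllocateImpl(size);
    a->Register(this);  // push this allocator onto the decoration stack
    return AllocationPtr(a, &Deleter);
  }

  void Free(ToyAllocation* a) {
    a->Pop();  // unwind one level, then let the concrete allocator act
    FreeImpl(a);
  }

 protected:
  virtual ToyAllocation* AllocateImpl(size_t size) = 0;
  // Default FreeImpl hands the allocation to the next allocator down the
  // chain, which is what the patched Allocator::FreeImpl does.
  virtual void FreeImpl(ToyAllocation* a) { a->Top()->Free(a); }
};

// Terminal allocator backed by malloc/free.
class MallocAllocator : public ToyAllocator {
 protected:
  ToyAllocation* AllocateImpl(size_t size) override {
    return new ToyAllocation(std::malloc(size), size);
  }
  void FreeImpl(ToyAllocation* a) override {
    std::free(a->ptr());
    delete a;
  }
};

// A decorating allocator that pads each request and delegates the real work.
class PaddedAllocator : public ToyAllocator {
 public:
  explicit PaddedAllocator(ToyAllocator* underlying) : underlying_(underlying) {}
 protected:
  ToyAllocation* AllocateImpl(size_t size) override {
    // Keep the underlying allocation alive by releasing it from its smart
    // pointer; a real decorator would wrap it instead (cf. AlignedAllocation).
    return static_cast<ToyAllocation*>(underlying_->Allocate(size + 64).release());
  }
 private:
  ToyAllocator* underlying_;
};

int main() {
  MallocAllocator base;
  PaddedAllocator padded(&base);
  auto a = padded.Allocate(100);  // decoration stack is now [base, padded]
  std::cout << "allocated " << a->size() << " bytes\n";
  return 0;  // deleter pops `padded` first, then `base` releases the memory
}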
@@ -162,53 +134,42 @@ class Allocation { friend class Allocator; }; +using AllocationPtr = pten::Allocator::AllocationPtr; +using DecoratedAllocationPtr = + std::unique_ptr; + // Base interface class of memory Allocator. -class Allocator { +class Allocator : public pten::Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } - }; - - using AllocationPtr = std::unique_ptr; + static void AllocationDeleter(pten::Allocation* allocation) { + Allocator* allocator = + static_cast(allocation)->TopDecoratedAllocator(); + allocator->Free(allocation); + } // Allocate an allocation. // size may be 0, but it would be too complex if we handle size == 0 // in each Allocator. So we handle size == 0 inside AllocatorFacade // in our design. - inline AllocationPtr Allocate(size_t size) { + AllocationPtr Allocate(size_t size) override { auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); + static_cast(ptr)->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr, AllocationDeleter); } - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); + void Free(pten::Allocation* allocation) { + static_cast(allocation)->PopDecoratedAllocator(); FreeImpl(allocation); } - inline uint64_t Release(const platform::Place& place) { - return ReleaseImpl(place); - } - - // True if the `Allocate` is thread safe. - virtual bool IsAllocThreadSafe() const; + uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); } protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); + virtual pten::Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(pten::Allocation* allocation); virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; - inline size_t AlignedSize(size_t size, size_t alignment) { auto remaining = size % alignment; return remaining == 0 ? size : size + alignment - remaining; @@ -220,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { return diff == 0 ? 
0 : alignment - diff; } +template +decltype(auto) static_unique_ptr_cast(std::unique_ptr&& p) { + static_assert(std::is_base_of::value, + "Derived type must derive from Base."); + auto d = static_cast(p.release()); + return std::unique_ptr(d, p.get_deleter()); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9bc2f5461f383..474b4fe3d4522 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -94,7 +94,7 @@ class CUDAGraphAllocator class PrivateAllocation : public Allocation { public: PrivateAllocation(CUDAGraphAllocator* allocator, - AllocationPtr underlying_allocation) + DecoratedAllocationPtr underlying_allocation) : Allocation( underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -103,7 +103,7 @@ class CUDAGraphAllocator private: std::shared_ptr allocator_; - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; explicit CUDAGraphAllocator(const std::shared_ptr& allocator) @@ -116,12 +116,14 @@ class CUDAGraphAllocator } protected: - Allocation* AllocateImpl(size_t size) { + pten::Allocation* AllocateImpl(size_t size) { VLOG(10) << "Allocate " << size << " for CUDA Graph"; - return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + return new PrivateAllocation(this, + static_unique_ptr_cast( + underlying_allocator_->Allocate(size))); } - void FreeImpl(Allocation* allocation) { + void FreeImpl(pten::Allocation* allocation) { VLOG(10) << "delete for CUDA Graph"; delete allocation; } @@ -322,7 +324,7 @@ class AllocatorFacadePrivate { return static_cast(pool.Get(place))->stream(); } - void RecordStream(std::shared_ptr allocation, + void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { if (allocation->size() == 0) { return; @@ -339,7 +341,7 @@ class AllocatorFacadePrivate { } const gpuStream_t& GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = dynamic_cast(allocation.get()); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, @@ -391,10 +393,10 @@ class AllocatorFacadePrivate { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return new Allocation(nullptr, 0, place_); } - void FreeImpl(Allocation* allocation) override { delete allocation; } + void FreeImpl(pten::Allocation* allocation) override { delete allocation; } private: platform::Place place_; @@ -820,9 +822,9 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + return std::shared_ptr(Alloc(place, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, @@ -866,7 +868,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const platform::Stream& stream) { #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -884,14 +886,14 @@ std::shared_ptr AllocatorFacade::AllocShared( } #endif gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); + return std::shared_ptr(Alloc(place, size, s)); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, + const std::shared_ptr& allocation, const platform::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -962,7 +964,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, return m_->GetAllocator(place, stream)->Release(place); } -void AllocatorFacade::RecordStream(std::shared_ptr allocation, +void AllocatorFacade::RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, @@ -983,7 +985,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, } const gpuStream_t& AllocatorFacade::GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index d59ecaece5a70..76e2f0b5a94f6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -42,6 +42,7 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: + using Allocation = pten::Allocation; AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; ~AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index dd2a65d889d8d..ad62af8480f58 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { +pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( + size_t unaligned_size) { size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { size_t realloc_size = std::max(size, chunk_size_); try { - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } catch (BadAlloc &ex) { if (FLAGS_free_when_no_cache_hit) throw ex; FreeIdleChunks(); - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } auto *chunk = &(*chunks_.rbegin()); @@ -104,7 +107,7 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { return new BlockAllocation(block_it); } -void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void 
AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) { VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 2334a1b6d4d55..94aff93ec50f8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; // Release the memory block which is not used in pool. uint64_t ReleaseImpl(const platform::Place &place) override { @@ -64,10 +64,10 @@ class AutoGrowthBestFitAllocator : public Allocator { }; struct Chunk { - explicit Chunk(AllocationPtr allocation) + explicit Chunk(DecoratedAllocationPtr allocation) : allocation_(std::move(allocation)) {} - AllocationPtr allocation_; + DecoratedAllocationPtr allocation_; List blocks_; }; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 926af8292d2e8..5942fbe730e57 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -28,12 +28,12 @@ namespace allocation { class RecordedAllocator : public Allocator { protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { allocated_size_ += size; return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; @@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator { size_t AllocatedSize() const { return allocated_size_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { if (allocated_size_ + size > capacity_) { throw BadAlloc("", __FILE__, __LINE__); } @@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator { return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index a34750a5e34ba..5edabfcb9f5e7 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -37,7 +37,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -56,7 +56,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, 
size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -77,7 +77,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -91,7 +91,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void ZeroSizeAllocTest() { AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 0955b5212622f..3cba70bd5b502 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) { } } -BestFitAllocator::BestFitAllocator(Allocation* allocation) +BestFitAllocator::BestFitAllocator(pten::Allocation* allocation) : allocation_(allocation) { details::Chunk chunk; chunk.size_ = allocation_->size(); @@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::FreeImpl(pten::Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( bf_allocation, @@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 42f69e6d704af..297d876178f3d 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation { // the prev-chunk and the next-chunk when possible. 
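// A minimal usage sketch (not taken from this patch) of BestFitAllocator
// after the switch to pten::Allocation. It assumes the fluid allocation
// headers shown above are included and a CPU-only build; any name that does
// not appear in the surrounding diff (kBufferSize, BestFitSketch) is
// illustrative only.
#include <cstdlib>

using paddle::memory::allocation::AllocationPtr;
using paddle::memory::allocation::BestFitAllocator;

void BestFitSketch() {
  constexpr size_t kBufferSize = 1 << 20;
  void* buf = std::malloc(kBufferSize);
  // The allocator only borrows the buffer; ownership stays with the caller.
  pten::Allocation buffer(buf, kBufferSize, paddle::platform::CPUPlace());
  BestFitAllocator allocator(&buffer);
  AllocationPtr chunk = allocator.Allocate(256);  // sub-allocation of buffer
  chunk.reset();  // the deleter routes the chunk back through allocator.Free()
  std::free(buf);
}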
class BestFitAllocator : public Allocator { public: - explicit BestFitAllocator(Allocation* allocation); + explicit BestFitAllocator(pten::Allocation* allocation); void* BasePtr() const { return allocation_->ptr(); } @@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: - Allocation* allocation_; // not owned + pten::Allocation* allocation_; // not owned details::ChunkList chunks_; details::FreeChunkBin free_chunks_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 325cb010bf466..11739ebba955f 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) { bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } -void BufferedAllocator::FreeImpl(Allocation *allocation) { +void BufferedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); - allocations_.emplace(allocation->size(), AllocationPtr(allocation)); + allocations_.emplace(allocation->size(), + AllocationPtr(allocation, Allocator::AllocationDeleter)); } -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index 5e1733bd839de..0ccccef573963 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 0bfa10a1616b6..21c30efccd8ad 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -27,7 +27,7 @@ namespace memory { namespace allocation { inline std::unique_ptr GetBufferedAllocator( - Allocation *allocation, bool thread_safe) { + pten::Allocation *allocation, bool thread_safe) { std::unique_ptr allocator(new BestFitAllocator(allocation)); if (thread_safe) { allocator.reset(new LockedAllocator(std::move(allocator))); @@ -68,7 +68,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( alloc, platform::errors::InvalidArgument( @@ -77,7 +77,7 @@ class StubAllocator : public Allocator { ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++construct_count_; if (size == 0) { return 
new StubAllocation(nullptr, 0, platform::CPUPlace()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 128591f5a8d3e..bf0bd891be26f 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { +void CPUAllocator::FreeImpl(pten::Allocation *allocation) { void *p = allocation->ptr(); #ifdef _WIN32 _aligned_free(p); @@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) { delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUAllocator::AllocateImpl(size_t size) { void *p; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff63381658..a64089dd2de42 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -37,8 +37,8 @@ class CPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 4242083f2e617..ff9bbf4ab3df8 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -32,7 +32,7 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -42,7 +42,7 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 5969d4d20ddee..57e85a3dc21d1 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 33cf2fe054247..a6696634c12d4 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -41,7 +41,7 @@ namespace allocation { */ class CUDADeviceContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(AllocationPtr 
allocation) + explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} @@ -56,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -65,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation { } private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; const platform::CUDADeviceContext *dev_ctx_{nullptr}; }; @@ -102,14 +102,14 @@ class CUDADeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( "Default stream is not set for CUDADeviceContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = - new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + auto allocation = new CUDADeviceContextAllocation( + static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); @@ -121,7 +121,7 @@ class CUDADeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index f4baca8288f03..2ae2cf20ee6d4 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -101,7 +101,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } -void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { +void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -140,7 +140,7 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, granularity_); CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index c51b56566bb02..0e1e59d200d91 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -37,8 +37,8 @@ class CUDAVirtualMemAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc 
b/paddle/fluid/memory/allocation/locked_allocator.cc index 6e8f870b235ff..a0c8efddbd80d 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator( } } -void LockedAllocator::FreeImpl(Allocation *allocation) { +void LockedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); underlying_allocator_->Free(allocation); } -Allocation *LockedAllocator::AllocateImpl(size_t size) { +pten::Allocation *LockedAllocator::AllocateImpl(size_t size) { platform::LockGuardPtr guard(mtx_); return underlying_allocator_->Allocate(size).release(); } diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 1b8418bc8494a..d17c8b24e27bd 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -29,8 +29,8 @@ class LockedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 8710bbe6ce98b..ffe7ccf9190be 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -790,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { -Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -798,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { +void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 474a308a064fd..b7b3647ff98c1 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index 074a900cf5463..d9fa7ec27fdde 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -22,7 +22,7 @@ namespace memory { namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(Allocation* 
allocation) { +void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* NPUAllocator::AllocateImpl(size_t size) { +pten::Allocation* NPUAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetNPUDeviceId(place_.device); }); diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index bf668973505ba..88b0c9a24bb3d 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -28,8 +28,8 @@ class NPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::NPUPlace place_; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 292fe15c5d952..2389973fa9b88 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -26,7 +26,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { - Allocation *allocation = it->first; + auto *allocation = it->first; void *ptr = allocation->ptr(); free(ptr); npu_events_.erase(it++); @@ -38,7 +38,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { } } -Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; @@ -50,7 +50,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { return new Allocation(ptr, size, platform::NPUPinnedPlace()); } -void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); @@ -83,7 +83,7 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { return static_cast(0); } -void NPUPinnedAllocator::RecordEvent(Allocation *allocation, +void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation, aclrtStream stream) { std::lock_guard lock(mtx_); aclrtEvent event = nullptr; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index 1d3f8bf1e449d..716b12eea15f8 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override { return true; } void ProcessEventsAndFree(); - void RecordEvent(Allocation *allocation, aclrtStream stream); + void RecordEvent(pten::Allocation *allocation, aclrtStream stream); constexpr static size_t kAlignment = 4096UL; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: 
- std::unordered_map npu_events_; + std::unordered_map npu_events_; mutable std::mutex mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index c56a7235c109c..f1175fc4374e7 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -18,7 +18,7 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else @@ -26,7 +26,7 @@ void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #endif delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef33734a..800e3ff3bb2e3 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808b43..856b6c2e9a2b0 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard { size_t requested_size_; }; -void RetryAllocator::FreeImpl(Allocation* allocation) { +void RetryAllocator::FreeImpl(pten::Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); underlying_allocator_->Free(allocation); @@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) { } } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +pten::Allocation* RetryAllocator::AllocateImpl(size_t size) { auto alloc_func = [&, this]() { return underlying_allocator_->Allocate(size).release(); }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97f17..b427a37907a67 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -45,8 +45,8 @@ class RetryAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; uint64_t ReleaseImpl(const platform::Place& place) override { return underlying_allocator_->Release(place); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 787f3d9dca377..d636c73e07a18 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -98,12 +98,12 @@ class DummyAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "Here is a test exception, always BadAlloc.")); } - void FreeImpl(Allocation *) override {} + void FreeImpl(pten::Allocation *) override {} }; TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index a4f766f1d1abc..05c6a7adaff8b 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -19,7 +19,7 @@ namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - AllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -116,7 +116,7 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -136,13 +136,14 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { throw; } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( - std::move(underlying_allocation), default_stream_); + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; } -void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { +void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { StreamSafeCUDAAllocation* 
stream_safe_cuda_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index d84994f58a9c4..f54cdc749611a 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -34,7 +34,7 @@ namespace allocation { class StreamSafeCUDAAllocation : public Allocation { public: - StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, + StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream); void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); @@ -42,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation { const gpuStream_t &GetOwningStream() const; private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; @@ -57,8 +57,8 @@ class StreamSafeCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc index 3eb1f140edd84..987c7ea772d23 100644 --- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -32,12 +32,12 @@ struct StubAllocator : public Allocator { size_t AllocNum() const { return alloc_num_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++alloc_num_; return new Allocation(new uint8_t[size], size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { delete[] static_cast(allocation->ptr()); delete allocation; --alloc_num_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index c55f579981b00..9c9306517021a 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl( size); } - void FreeImpl(Allocation* allocation) override { + void FreeImpl(pten::Allocation* allocation) override { auto* tl_allocation = static_cast(allocation); auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index 5c7e8e2d933f3..face27debe9ff 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -35,7 +35,8 @@ 
VirtualMemoryAutoGrowthBestFitAllocator:: alignment_(alignment), place_(place) {} -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl( + size_t size) { std::lock_guard guard(spinlock_); size = AlignedSize(size, alignment_); auto result = AllocFromFreeBlocks(size); @@ -48,7 +49,8 @@ Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { return result; } -void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl( + pten::Allocation *allocation) { std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; TryMergeBlock2Blocks(block_it); @@ -225,7 +227,7 @@ void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) { } } -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( size_t size) { auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); if (iter != free_blocks_.end()) { diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h index 5171e5b3cd1bf..10bf0bbf49d5a 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h @@ -60,12 +60,12 @@ class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; private: - Allocation *AllocFromFreeBlocks(size_t size); + pten::Allocation *AllocFromFreeBlocks(size_t size); void ExtendAndMerge(size_t size); void TryMergeBlock2Blocks(std::list::iterator iter); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 7069fb46203d6..8830c46a17798 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -28,7 +28,7 @@ class DeviceContext; namespace memory { -using allocation::Allocation; +using pten::Allocation; using allocation::Allocator; using allocation::AllocationPtr; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index bc2d496a3e76a..6892f7ce4e503 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -336,9 +336,8 @@ class ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(col_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(col_alloc_released); }); #endif } @@ -466,9 +465,8 @@ class SplitFunctor { auto* data_alloc_released = data_alloc.release(); auto* cols_alloc_released = cols_alloc.release(); context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(cols_alloc_released); + 
memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); }); #endif } diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index 9deab92af5cd6..408016c0f0d99 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -55,7 +55,7 @@ class MLUDeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -91,7 +91,7 @@ class MLUDeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( @@ -105,7 +105,7 @@ class MLUDeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::MLUPlace place_; diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index e83057e682fef..c049da3b33566 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation *allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation *allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); } else { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 3f8923440be50..659df6b9b44de 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -53,7 +53,7 @@ size_t PyArray_Size_(PyObject* numpy_data) { return res; } -class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { +class EagerNumpyAllocation : public pten::Allocation { public: explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) : Allocation( diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 4a44ad7758b56..a4db8c4b193b6 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/allocator.cc b/paddle/pten/api/lib/utils/allocator.cc deleted file mode 100644 index e80152431e712..0000000000000 --- a/paddle/pten/api/lib/utils/allocator.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/api/lib/utils/allocator.h" - -namespace paddle { -namespace experimental { - -memory::Allocator::AllocationDeleter DefaultAllocator::deleter_; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index 4f5a810e400ce..a8c05b7651689 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -22,14 +22,15 @@ limitations under the License. */ namespace paddle { namespace experimental { -class DefaultAllocator : public pten::Allocator { +class DefaultAllocator : public pten::deprecated::Allocator { public: - using Allocation = pten::Allocation; + using Allocation = pten::deprecated::Allocation; explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} static void Delete(Allocation* allocation) { - deleter_(allocation->CastContextWithoutCheck()); + paddle::memory::allocation::Allocator::AllocationDeleter( + allocation->CastContextWithoutCheck()); } Allocation Allocate(size_t bytes_size) override { @@ -42,7 +43,6 @@ class DefaultAllocator : public pten::Allocator { private: paddle::platform::Place place_; - static paddle::memory::Allocator::AllocationDeleter deleter_; }; } // namespace experimental diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc index 9ee1b9e5b7f92..6116a709d5065 100644 --- a/paddle/pten/api/lib/utils/storage.cc +++ b/paddle/pten/api/lib/utils/storage.cc @@ -20,14 +20,13 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place) - : pten::Storage( - std::make_shared(ptr, size, place)), + : pten::Storage(std::make_shared(ptr, size, place)), size_(size) {} ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, size_t delta, size_t size) - : Storage(std::make_shared( + : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { PADDLE_ENFORCE_LE(static_cast(delta + size), diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 69a1fc274a28d..0b6cb8d95cc1a 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -307,7 +307,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); auto storage = src->release(); - std::shared_ptr holder( + std::shared_ptr holder( new TensorStorage(std::move(storage))); dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h index 74455be136834..2647490c9f58b 100644 --- a/paddle/pten/core/allocator.h +++ b/paddle/pten/core/allocator.h @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/candidate/allocator.h" namespace pten { +namespace deprecated { /// \brief Encapsulates strategies for access/addressing, allocation/ /// deallocation and construction/destruction of objects. @@ -147,4 +149,5 @@ inline Allocation Allocate(const std::shared_ptr& a, size_t n) { return a->Allocate(n); } +} // namespace deprecated } // namespace pten diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h new file mode 100644 index 0000000000000..75d42c4fd15cb --- /dev/null +++ b/paddle/pten/core/candidate/allocator.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/place.h" + +namespace pten { + +/// \brief Fancy pointer with deleter. The use of this data type +/// is to be compatible with allocators from different frameworks +/// without significant performance loss. This class does not +/// support being inherited. +class Allocation { + public: + using Place = paddle::platform::Place; + using DeleterFnPtr = void (*)(Allocation*); + + Allocation() = default; + + // Don't own resources, only provide access. + Allocation(void* data, size_t size, const Place& place) + : ptr_(data), size_(size), place_(place) {} + + // Own resources. + Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place) + : ptr_(data), size_(size), deleter_(deleter), place_(place) {} + + Allocation(Allocation&& other) noexcept { swap(*this, other); } + Allocation& operator=(Allocation&& other) noexcept { + // Exchange them explicitly to avoid moving is equivalent + // to copying. + swap(*this, other); + return *this; + } + + virtual ~Allocation() { + if (deleter_) { + deleter_(this); + } + } + + // Returns the holding pointer. + // NOTE: For performance consideration, it is better not to make this method + // as a virtual method. If we want to implement a `defragmentation` later, + // we might need to make `ptr_` field as a protected field, and add a virtual + // method like `defragmentation` to change `ptr_`. + void* ptr() const noexcept { return ptr_; } + + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the + // last valid element. + // + // NOTE: Some allocator might alloc more memory than request. The size + // could larger than its request. For example, + // the AlignedAllocator will always allocate memory as size + kAlignment. + // The raw pointer might not aligned, so an offset might be added to raw + // the pointer. The size of this allocation will be + // `size + kAlignemnt - offset`. 
+ size_t size() const noexcept { return size_; } + + void* operator->() const noexcept { return ptr_; } + operator bool() const noexcept { return ptr_; } + const Place& place() const noexcept { return place_; } + DeleterFnPtr deleter() const noexcept { return deleter_; } + + protected: + friend void swap(Allocation& a, Allocation& b) noexcept; + void* ptr_{nullptr}; + size_t size_{}; + DeleterFnPtr deleter_{nullptr}; + // TODO(Shixiaowei02): Enum needs to be used instead to reduce + // the construction overhead by more than 50%. + Place place_; +}; + +inline void swap(Allocation& a, Allocation& b) noexcept { + ::std::swap(a.ptr_, b.ptr_); + ::std::swap(a.deleter_, b.deleter_); + ::std::swap(a.place_, b.place_); + ::std::swap(a.size_, b.size_); +} + +class Allocator { + public: + using DeleterType = std::function; + using AllocationPtr = std::unique_ptr; + + virtual ~Allocator() = default; + virtual AllocationPtr Allocate(size_t bytes_size) = 0; + + virtual bool IsAllocThreadSafe() const { return false; } +}; + +} // namespace pten diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1502accd197be..1802a2461158f 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -60,6 +60,8 @@ class TensorInplaceVersion { class DenseTensor : public TensorBase, public TypeInfoTraits { public: + using Allocator = deprecated::Allocator; + /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index fc56935eeaf19..cf18dd913093a 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter { class TensorStorage : public Storage { public: using Place = paddle::platform::Place; + using Allocator = deprecated::Allocator; explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index 094c0e8437d98..c2c74e1aacf1f 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace pten { namespace tests { -class HostAllocatorSample : public pten::RawAllocator { +class HostAllocatorSample : public pten::deprecated::RawAllocator { public: using Place = paddle::platform::Place; void* Allocate(size_t bytes_size) override { @@ -36,8 +36,9 @@ class HostAllocatorSample : public pten::RawAllocator { Place place_{paddle::platform::CPUPlace()}; }; -class FancyAllocator : public pten::Allocator { +class FancyAllocator : public pten::deprecated::Allocator { public: + using Allocation = pten::deprecated::Allocation; static void Delete(Allocation* allocation) { ::operator delete(allocation->ptr()); } @@ -55,7 +56,7 @@ class FancyAllocator : public pten::Allocator { template struct CustomAllocator { using value_type = T; - using Allocator = pten::RawAllocator; + using Allocator = pten::deprecated::RawAllocator; explicit CustomAllocator(const std::shared_ptr& a) noexcept : alloc_(a) {} diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc index c509d8bd20a01..94ba9a1e1b9a2 100644 --- a/paddle/pten/tests/core/test_allocator.cc +++ b/paddle/pten/tests/core/test_allocator.cc @@ -24,6 +24,10 @@ limitations under the License. 
*/ namespace pten { namespace tests { +using RawAllocator = pten::deprecated::RawAllocator; +using Allocator = pten::deprecated::Allocator; +using Allocation = pten::deprecated::Allocation; + template bool host_allocator_test(size_t vector_size) { std::vector src(vector_size); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0ae600819873..caacecf446a82 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true` if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n" - check_approval 2 6888866 39303645 + check_approval 1 6888866 39303645 fi HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true` @@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 22561442 22334008 fi -ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true` -if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - ERROR_LINES="" - for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED}; - do - HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true` - if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then - ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n" - fi - done - if [ "${ERROR_LINES}" != "" ]; then - ERROR_LINES=${ERROR_LINES//+/'\n+\t'} - echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n" - check_approval 2 6888866 39303645 - fi -fi - ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. 
Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" From 5e5157812d0284f265c4d927b85d66b5bfb9c6d2 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Thu, 13 Jan 2022 11:06:09 +0800 Subject: [PATCH 119/151] Support test_imperative using_non_zero_gpu with _test_eager_guard() (#38881) * Support test_imperative using_non_zero_gpu and Add a TODO comment * Change GPU number to 0 * Modify the cuda device selection method --- .../unittests/test_imperative_numpy_bridge.py | 1 + .../test_imperative_using_non_zero_gpu.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 4f3089baffdd3..7b8d31ff030e5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -42,6 +42,7 @@ def func_tensor_from_numpy(self): self.assertEqual(data_np[0][0], -1) if _in_eager_mode(): # eager_mode, var2 is EagerTensor, is not subscriptable + # TODO(wuweilong): to support slice in eager mode later self.assertNotEqual(var2.numpy()[0][0], -1) else: self.assertNotEqual(var2[0][0].numpy()[0], -1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index f2dfaef397797..46a89efcec491 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid import unittest -from paddle.fluid.dygraph import to_variable, Embedding, guard +from paddle.fluid.dygraph import to_variable, guard import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestImperativeUsingNonZeroGpu(unittest.TestCase): @@ -24,12 +26,21 @@ def run_main(self, np_arr, place): var = to_variable(np_arr) self.assertTrue(np.array_equal(np_arr, var.numpy())) - def test_non_zero_gpu(self): + def func_non_zero_gpu(self): if not fluid.is_compiled_with_cuda(): return np_arr = np.random.random([11, 13]).astype('float32') - self.run_main(np_arr, fluid.CUDAPlace(0)) + if paddle.device.cuda.device_count() > 1: + # should use non zero gpu if there are more than 1 gpu + self.run_main(np_arr, fluid.CUDAPlace(1)) + else: + self.run_main(np_arr, fluid.CUDAPlace(0)) + + def test_non_zero_gpu(self): + with _test_eager_guard(): + self.func_non_zero_gpu() + self.func_non_zero_gpu() if __name__ == '__main__': From 281644cd0734d99151b08f8e221c2fd58a326249 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 11:15:49 +0800 Subject: [PATCH 120/151] Fix mkldnn invalid infershape impl (#38837) * fix mkldnn invalid infershape * add unittest for mkldnn in new executor * add import os --- .../fluid/eager/legacy/infer_shape_context.h | 19 ++++++++++++++----- .../fluid/eager/legacy/prepared_operator.cc | 2 +- .../new_executor/new_executor_defs.cc | 11 +++++++++++ .../new_executor/new_executor_defs.h | 2 ++ paddle/fluid/framework/op_desc.cc | 4 ++++ paddle/fluid/framework/operator.cc | 15 ++++++++++++--- paddle/fluid/framework/operator.h | 7 ++----- paddle/fluid/framework/shape_inference.h | 2 ++ paddle/fluid/imperative/infer_shape_context.h | 19 +++++++++++++------ paddle/fluid/imperative/prepared_operator.cc | 8 ++++---- paddle/fluid/operators/batch_norm_op.cc | 6 +++--- paddle/fluid/operators/conv_op.cc | 2 +- paddle/fluid/operators/conv_transpose_op.cc | 4 ++-- paddle/fluid/operators/inplace_abn_op.cc | 8 ++++---- paddle/fluid/operators/pool_op.cc | 2 +- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 10 ++++++++++ 16 files changed, 86 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 7a05f6a9b3581..a1032fd404f85 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -31,15 +31,18 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { using DDim = paddle::framework::DDim; public: - EagerInferShapeContext(const NameTensorMap* in, const NameTensorMap* out, - const paddle::framework::AttributeMap* attr, - const paddle::framework::AttributeMap* default_attr, - const std::string op_type) + EagerInferShapeContext( + const NameTensorMap* in, const NameTensorMap* out, + const paddle::framework::AttributeMap* attr, + const paddle::framework::AttributeMap* default_attr, + const std::string op_type, + const paddle::framework::OpKernelType* op_kernel_type = nullptr) : tensor_in_(in), tensor_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +217,11 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && (op_kernel_type_->data_layout_ == + 
paddle::framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -400,6 +408,7 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { const paddle::framework::AttributeMap* attrs_; const paddle::framework::AttributeMap* default_attrs_; const std::string op_type_; + const paddle::framework::OpKernelType* op_kernel_type_; }; } // namespace legacy diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index 4e892b14a9c9c..fbf2d678740ab 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -173,7 +173,7 @@ static void PreparedOpRunImpl( paddle::framework::Scope scope; EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, - op.Type()); + op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 4b9404fd178fd..654746794da4e 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -307,6 +307,17 @@ void InterpretercoreInferShapeContext::SetLoDLevel(const std::string& out, bool InterpretercoreInferShapeContext::IsRuntime() const { return true; } +bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } +} + // TODO(paddle-dev): Can this be template? std::vector InterpretercoreInferShapeContext::GetInputVarPtrs( const std::string& name) const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index ca49e7f5670d6..5d63eb33d424b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -84,6 +84,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + // TODO(paddle-dev): Can this be template? 
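A standalone sketch of the dynamic_cast/std::bad_cast capability probe that the IsRunMKLDNNKernel() implementations above rely on; OpBase, KernelOp and PlainOp are hypothetical stand-ins for the framework types:

#include <iostream>
#include <typeinfo>

struct OpBase {
  virtual ~OpBase() = default;
};
struct KernelOp : OpBase {
  bool uses_special_layout() const { return true; }
};
struct PlainOp : OpBase {};

// Reference dynamic_cast throws std::bad_cast for unrelated types, so the
// probe degrades to "false" instead of failing on ops that carry no kernel.
bool RunsSpecialLayout(const OpBase& op) {
  try {
    auto& with_kernel = dynamic_cast<const KernelOp&>(op);
    return with_kernel.uses_special_layout();
  } catch (const std::bad_cast&) {
    return false;
  }
}

int main() {
  KernelOp k;
  PlainOp p;
  std::cout << RunsSpecialLayout(k) << " " << RunsSpecialLayout(p) << "\n";  // prints: 1 0
  return 0;
}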
std::vector GetInputVarPtrs( const std::string& name) const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 4254ec236d473..7bceeb05bac59 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -240,6 +240,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + std::vector GetInputsVarType( const std::string &name) const override { return GetVarTypes(Inputs(name)); @@ -930,6 +932,8 @@ void CompileTimeInferShapeContext::SetRepeatedDims( bool CompileTimeInferShapeContext::IsRuntime() const { return false; } +bool CompileTimeInferShapeContext::IsRunMKLDNNKernel() const { return false; } + proto::VarType::Type CompileTimeInferShapeContext::GetVarType( const std::string &name) const { return block_.FindVarRecursive(name)->GetType(); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index dc4d1365093aa..93349b8b88449 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -884,6 +884,17 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -1178,9 +1189,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("infer_shape", platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); - // TODO(chenweihang): replace this after removing `this->IsMKLDNNType()` - // in some mkldnn infershape functions, such conv2d infershape - this->InferShape(&infer_shape_ctx); + this->Info().infer_shape_(&infer_shape_ctx); } if (FLAGS_enable_unused_var_check) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 09e4abc77f573..8e69f96dfb813 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -528,11 +528,6 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } - bool IsMKLDNNType() const { - return ((this->kernel_type_) && (this->kernel_type_->data_layout_ == - framework::DataLayout::kMKLDNN)); - } - bool SupportGPU() const override { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), @@ -609,6 +604,8 @@ class OperatorWithKernel : public OperatorBase { return pt_kernel_context_.get(); } + const OpKernelType* kernel_type() const { return kernel_type_.get(); } + private: void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place, diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 10b0fa6afd78a..791600b39c3d9 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -102,6 +102,8 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; + virtual bool IsRunMKLDNNKernel() const = 0; + virtual std::vector GetInputVarPtrs( const std::string &name) const = 0; virtual std::vector GetOutputVarPtrs( diff --git 
a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 167d5682cbfdb..a16ad1688fbac 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -32,16 +32,17 @@ class DygraphInferShapeContext : public framework::InferShapeContext { using DDim = framework::DDim; public: - DygraphInferShapeContext(const NameVarMap* in, - const NameVarMap* out, - const framework::AttributeMap* attr, - const framework::AttributeMap* default_attr, - const std::string op_type) + DygraphInferShapeContext( + const NameVarMap* in, const NameVarMap* out, + const framework::AttributeMap* attr, + const framework::AttributeMap* default_attr, const std::string op_type, + const framework::OpKernelType* op_kernel_type = nullptr) : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +215,11 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && + (op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -399,6 +405,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { const framework::AttributeMap* attrs_; const framework::AttributeMap* default_attrs_; const std::string op_type_; + const framework::OpKernelType* op_kernel_type_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1d12ecf30ede5..46e974c8f43f3 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -514,8 +514,8 @@ static void PreparedOpRunImpl( // TODO(zjl): remove scope in dygraph framework::Scope scope; - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, @@ -560,8 +560,8 @@ static void PreparedOpRunPtImpl( platform::DeviceContext* dev_ctx, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index bc5bd118dbec4..0a8e753c01dc0 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -93,7 +93,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { x_dims, x_dims.size())); const int64_t C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? 
x_dims[1] : x_dims[x_dims.size() - 1]); @@ -508,7 +508,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); @@ -911,7 +911,7 @@ void BatchNormDoubleGradOp::InferShape( const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 41f6f75200697..e500814232aae 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -57,7 +57,7 @@ std::vector ConvOp::ComputeOutputShape( // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index d60786f60e9cc..12f537e2f7980 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -49,8 +49,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { const std::string data_layout_str = ctx->Attrs().Get("data_format"); const DataLayout data_layout = - this->IsMKLDNNType() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); + ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW + : framework::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 8234d63d681ff..7a112292c8fc5 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -100,10 +100,10 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); - const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) - ? y_dims[1] - : y_dims[y_dims.size() - 1]); + const int C = ((ctx->IsRunMKLDNNKernel() == true) || + (data_layout == DataLayout::kNCHW) + ? 
y_dims[1] + : y_dims[y_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), y_dims); // has_scale_grad == has_bias_grad, judge has_scale_grad is enough diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index fa98e76e39338..b4ba80ae7ae2f 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -97,7 +97,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); // update paddings if "SAME" or global_pooling diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 50d53864789f3..487a69807e2b0 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import numpy as np @@ -232,6 +233,15 @@ def init_group(self): self.groups = 3 +# TODO(chenweihang): To solve the coverage problem, add this unittest, +# remove this unittest after new executor set to default executor +class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp): + def test_check_output_by_new_executor(self): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + self.test_check_output() + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + + if __name__ == '__main__': from paddle import enable_static enable_static() From fc6eed5b2789d5cdb5c84bf2fb9e41db2bcfdc5d Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 13 Jan 2022 04:43:45 +0100 Subject: [PATCH 121/151] Added mul BF16/FP32 FWD/BWD oneDNN kernel (#38552) * base changes for mul reimplementation * empty commit * tmp save * full implementation of mul bf16/fp32 fwd bwd * CI fix * CI rerun * changed unity build cmake to avoid gpu issues * removed mul mkldnn from unity build * added skipping tests if not cpu_bf16 * CI fix * CI fix * CI fix --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 109 +---------- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 176 +++++++++++++++++- paddle/fluid/operators/mul_op.cc | 36 ++++ paddle/fluid/operators/mul_op.h | 1 + paddle/fluid/operators/unity_build_rule.cmake | 1 - paddle/fluid/platform/mkldnn_reuse.h | 108 +++++++++++ .../contrib/mixed_precision/bf16/amp_lists.py | 2 +- .../fluid/tests/book/test_fit_a_line.py | 13 ++ .../mkldnn/test_mul_int8_mkldnn_op.py | 2 + .../unittests/mkldnn/test_mul_mkldnn_op.py | 159 ++++++++++++++++ 10 files changed, 490 insertions(+), 117 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index a8d4b852ca3c2..d3c7c1759641b 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,6 +20,7 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; @@ -107,114 +108,6 @@ std::vector 
GetInputStrides(const ExecutionContext& ctx, return strides; } -template -class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { - public: - MatMulV2MKLDNNHandler(const dnnl::engine engine, - paddle::platform::Place cpu_place, - const std::vector& x_org_dims, bool trans_x, - const std::vector& y_org_dims, bool trans_y, - bool is_output_fused, - const std::vector& x_strides_override, - const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (!x_strides_override.empty()) { - x_strides = x_strides_override; - } else { - if (!trans_x) { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } else { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } - } - - if (!y_strides_override.empty()) { - y_strides = y_strides_override; - } else { - if (!trans_y) { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } else { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - if (x_strides_override.empty()) { - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - } - if (y_strides_override.empty()) { - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - } - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - if (is_output_fused) { - out_strides = FakeTransposeStrides(out_ddims); - } - - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); - - this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); - } - - std::vector FakeTransposeStrides( - const std::vector& matmul_out_dims) const { - // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and - // transpose axis are: {0, 2, 1, 3} - std::vector transpose_axis = {0, 2, 1, 3}; - std::vector fake_strides(transpose_axis.size()); - int ndims = static_cast(transpose_axis.size()); - - int total_stride = 1; - - for (int i = ndims - 1; i >= 0; --i) { - fake_strides[transpose_axis[i]] = total_stride; - total_stride *= matmul_out_dims[transpose_axis[i]]; - } - - return fake_strides; - } - - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); - } -}; - bool IsOutputFused(const ExecutionContext& ctx) { auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); diff --git 
a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 0938024052271..49c896ef80fcc 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/mul_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace framework { @@ -32,13 +32,17 @@ namespace operators { using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; using framework::Tensor; + +using platform::MatMulV2MKLDNNHandler; +using platform::MKLDNNDeviceContext; +using platform::to_void_cast; + using dnnl::inner_product_forward; using dnnl::memory; using dnnl::prop_kind; using dnnl::stream; -using platform::MKLDNNDeviceContext; -using platform::to_void_cast; template class MulPrimitiveFactory { @@ -345,7 +349,7 @@ inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx, /* XT: input x data type, YT: input y data type */ template -class MulMKLDNNKernel : public framework::OpKernel { +class MulMKLDNNINT8Kernel : public framework::OpKernel { public: void Compute(const ExecutionContext &ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, @@ -371,17 +375,175 @@ class MulMKLDNNKernel : public framework::OpKernel { } }; +template +class MulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + protected: + void ExecuteMatMul(const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine &onednn_engine, + const platform::Place &cpu_place, const Tensor *x, + const std::vector &x_dims, bool trans_x, + const Tensor *y, const std::vector &y_dims, + bool trans_y, Tensor *out) const { + static const std::vector vec_placeholder; + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, false, + vec_placeholder, vec_placeholder); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + // plain output formats are enforced inside handler + out->set_format(platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw)); + } + + private: + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + + // adding mb dim because MatMulV2 handler needs it + std::vector y_dims(3, 1); + std::vector x_dims(3, 1); + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), &x_matrix, + x_dims, false, &y_matrix, y_dims, false, out); + } +}; + +template +class MulGradMKLDNNKernel : public MulMKLDNNKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + private: + template + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + + Tensor dout_matrix = *dout; + dout_matrix.Resize( + {framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + // adding mb dim because MatMulV2 handler needs it + std::vector x_dims(3, 1); + std::vector y_dims(3, 1); + std::vector dout_dims(3, 1); + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + dout_dims[1] = dout_matrix.dims()[0]; + dout_dims[2] = dout_matrix.dims()[1]; + + if (dx != nullptr) { + dx->set_lod(x->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &dout_matrix, dout_dims, false, &y_matrix, y_dims, + true, static_cast(dx)); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &x_matrix, x_dims, true, &dout_matrix, dout_dims, + false, static_cast(dy)); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, + FP32, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel, + ops::MulMKLDNNKernel, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel, + ops::MulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 
14291f8458430..691c394870ad4 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -113,6 +113,12 @@ class MulOp : public framework::OperatorWithKernel { if (input_data_type == framework::DataTypeTrait::DataType() || input_data_type == framework::DataTypeTrait::DataType()) { customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; } } #endif @@ -233,6 +239,36 @@ class MulGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + + if (input_data_type == framework::DataTypeTrait::DataType() || + input_data_type == framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; + } + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); + } }; template diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 3a13e0576e347..0fb32cf4be886 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -25,6 +25,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kMULMKLDNNINT8 = 1; +constexpr int kMULMKLDNNFP32 = 2; template class MulKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 25aef67425ef9..5ab2004617810 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -192,7 +192,6 @@ register_unity_group(cc pad_op.cc) register_unity_group(cc modified_huber_loss_op.cc - mkldnn/mul_mkldnn_op.cc partial_sum_op.cc pixel_shuffle_op.cc pool_op.cc diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index c16137b50dbf7..ef216e48416f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -772,6 +772,114 @@ class ReductionMKLDNNHandler } }; +template +class MatMulV2MKLDNNHandler + : public paddle::platform::MKLDNNHandlerNoCachingT { + public: + MatMulV2MKLDNNHandler(const dnnl::engine engine, + paddle::platform::Place cpu_place, + const std::vector& x_org_dims, bool trans_x, + const std::vector& y_org_dims, bool trans_y, + bool is_output_fused, + const std::vector& x_strides_override, + const std::vector& y_strides_override) + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() 
- 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + if (is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + + this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); + } + + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data)); + } +}; + template class ActivationMKLDNNHandler : public MKLDNNHandlerNoCachingT> 16)) + out = numpy.reshape(out, in_list.shape).view(numpy.uint16) + return out + + def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') @@ -158,6 +167,10 @@ def infer(use_cuda, save_dirname=None, use_bf16=False): test_data = next(test_reader()) test_feat = numpy.array( [data[0] for data in test_data]).astype("float32") + + if use_bf16: + test_feat = convert_float_to_uint16(test_feat) + test_label = numpy.array( [data[1] for data in test_data]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 0c91868d30245..9265d5f7edfbb 100644 --- 
a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci ''' @@ -159,4 +160,5 @@ def init_data_type(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py new file mode 100644 index 0000000000000..a0581d791209d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from numpy.matrixlib import defmatrix +import paddle +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, OpTestTool + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestMulOneDNNOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.attrs = {'use_mkldnn': True} + self.init_shapes_and_attrs() + + self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) + self.y_fp32 = np.random.random(self.y_shape).astype(np.float32) + + self.x = self.x_fp32 + self.y = self.y_fp32 + + self.init_inputs_dtype() + + self.inputs = {'X': self.x, 'Y': self.y} + + output = np.dot( + np.reshape(self.x_fp32, self.np_x_shape), + np.reshape(self.y_fp32, self.np_y_shape)) + self.outputs = {'Out': np.reshape(output, self.out_shape)} + + def init_shapes_and_attrs(self): + self.x_shape = (20, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (20, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (20, 21) + + def init_inputs_dtype(self): + pass + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place(core.CPUPlace(), ['Y'], 'Out', set('X')) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place(core.CPUPlace(), ['X'], 'Out', set('Y')) + + +class TestMulXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (6, 7, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (42, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (6, 7, 21) + + self.attrs["x_num_col_dims"] = 2 + + +class TestMulYNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (20, 6) + self.y_shape = (2, 3, 21) + + self.np_x_shape = (20, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (20, 21) + + self.attrs["y_num_col_dims"] = 2 + + +class TestMulYAndXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (10, 5, 6) + self.y_shape = (2, 3, 21) + + 
self.np_x_shape = (50, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (10, 5, 21) + + self.attrs["x_num_col_dims"] = 2 + self.attrs["y_num_col_dims"] = 2 + + +class TestMulBF16OneDNNOp(TestMulOneDNNOp): + def init_inputs_dtype(self): + self.x = convert_float_to_uint16(self.x) + self.y = convert_float_to_uint16(self.y) + + def calculate_grads(self): + x_np = np.reshape(self.x_fp32, self.np_x_shape) + y_np = np.reshape(self.y_fp32, self.np_y_shape) + + self.dout = self.outputs['Out'] + self.dout_np = np.reshape(self.dout, (x_np.shape[0], y_np.shape[1])) + + y_np_trans = np.transpose(y_np, (1, 0)) + x_np_trans = np.transpose(x_np, (1, 0)) + + self.dx = np.matmul(self.dout_np, y_np_trans) + self.dy = np.matmul(x_np_trans, self.dout_np) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X', 'Y'], + 'Out', + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_x(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['Y'], + 'Out', + set('X'), + user_defined_grads=[self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_y(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X'], + 'Out', + set('Y'), + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 08dcea18edaf19ef1eeea1a8905e28d6f318d211 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 13 Jan 2022 14:00:27 +0800 Subject: [PATCH 122/151] roi_align aligned supported (#38905) roi_align aligned supported --- .../tensorrt/convert/roi_align_op.cc | 4 +- paddle/fluid/inference/tensorrt/op_teller.cc | 30 --------- .../tensorrt/plugin/roi_align_op_plugin.cu | 64 +++++++++++-------- .../tensorrt/plugin/roi_align_op_plugin.h | 4 +- .../inference/test_trt_convert_roi_align.py | 10 --- 5 files changed, 45 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 654fe7e013379..54f7937d83747 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -51,6 +51,7 @@ class RoiAlignOpConverter : public OpConverter { BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); const auto sampling_ratio = BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + const auto aligned = BOOST_GET_CONST(bool, op_desc.GetAttr("aligned")); const auto input_tensor = engine_->GetITensor(input_name); const auto rois_tensor = engine_->GetITensor(rois_name); @@ -63,7 +64,8 @@ class RoiAlignOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( - data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio, + aligned); auto roi_align_layer = engine_->network()->addPluginV2( inputs.data(), inputs.size(), *roi_align_plugin); layer = roi_align_layer; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 878eef016e7d1..ddee4e0d682b0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,9 +13,7 @@ // limitations under the License. 
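A standalone numeric sketch of what the new aligned flag changes in the ROI coordinate math of the plugin below; RoiExtent is a hypothetical helper, not part of the TensorRT plugin:

#include <algorithm>
#include <cstdio>

// aligned == true: shift the scaled ROI by half a pixel and keep zero-size
// ROIs as-is; aligned == false keeps the legacy clamp to a minimum width of 1.
void RoiExtent(float xmin, float xmax, float spatial_scale, bool aligned,
               float* out_xmin, float* out_width) {
  const float offset = aligned ? 0.5f : 0.f;
  const float roi_xmin = xmin * spatial_scale - offset;
  const float roi_xmax = xmax * spatial_scale - offset;
  float width = roi_xmax - roi_xmin;
  if (!aligned) width = std::max(width, 1.f);
  *out_xmin = roi_xmin;
  *out_width = width;
}

int main() {
  float xmin = 0.f, width = 0.f;
  RoiExtent(4.f, 4.f, 0.25f, false, &xmin, &width);
  std::printf("aligned=false: xmin=%.2f width=%.2f\n", xmin, width);  // 1.00 1.00
  RoiExtent(4.f, 4.f, 0.25f, true, &xmin, &width);
  std::printf("aligned=true:  xmin=%.2f width=%.2f\n", xmin, width);  // 0.50 0.00
  return 0;
}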
#include "paddle/fluid/inference/tensorrt/op_teller.h" - #include - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -737,28 +735,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } - if (op_type == "roi_align") { - if (!with_dynamic_shape) return false; - - std::vector attrs{"pooled_height", "pooled_width", - "spatial_scale", "sampling_ratio"}; - for (auto const attr : attrs) { - if (!desc.HasAttr(attr)) return false; - } - - const auto pooled_height = - BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); - if (pooled_height <= 0) return false; - - const auto pooled_width = - BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); - if (pooled_width <= 0) return false; - - const auto spatial_scale = - BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); - if (spatial_scale <= 0.f) return false; - } - if (op_type == "hard_swish") { if (desc.Input("X").size() != 1) { VLOG(3) << "HardSwish op has only 1 input, but got " @@ -1303,12 +1279,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); if (spatial_scale <= 0.f) return false; - const auto sampling_ratio = - BOOST_GET_CONST(int, desc.GetAttr("sampling_ratio")); - const auto aligned = BOOST_GET_CONST(bool, desc.GetAttr("aligned")); - - if (sampling_ratio == -1 && aligned == true) return false; - auto roi_align_inputs = desc.Inputs(); if (roi_align_inputs.find("RoisNum") != roi_align_inputs.end()) { if (desc.Input("RoisNum").size() >= 1) { diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 06540b3626082..7dc31fb44719a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -58,14 +58,12 @@ __inline__ __device__ T BilinearInterpolate(const T* input_data, } template -__global__ void GPUROIAlignOpt(const int nthreads, - const T* __restrict__ input_data, - const T* __restrict__ input_rois, - const float spatial_scale, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int sampling_ratio, const int num_rois, - OutT* __restrict__ output_data) { +__global__ void GPUROIAlignOpt( + const int nthreads, const T* __restrict__ input_data, + const T* __restrict__ input_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + const int num_rois, const bool aligned, OutT* __restrict__ output_data) { const int batch = blockIdx.x; const int channel = blockIdx.y; const T* offset_input_data = @@ -84,21 +82,28 @@ __global__ void GPUROIAlignOpt(const int nthreads, const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; const int n = batch * num_rois + roi_idx; const float4 rois_offset = reinterpret_cast(input_rois)[n]; - const T roi_xmin = rois_offset.x * spatial_scale; - const T roi_ymin = rois_offset.y * spatial_scale; - const T roi_xmax = rois_offset.z * spatial_scale; - const T roi_ymax = rois_offset.w * spatial_scale; - const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); - const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); - const T bin_size_h = roi_height / static_cast(pooled_height); - const T bin_size_w = roi_width / static_cast(pooled_width); + const T roi_offset = aligned ? 
static_cast(0.5) : 0; + const T roi_xmin = rois_offset.x * spatial_scale - roi_offset; + const T roi_ymin = rois_offset.y * spatial_scale - roi_offset; + const T roi_xmax = rois_offset.z * spatial_scale - roi_offset; + const T roi_ymax = rois_offset.w * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + const T bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); + const T bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); const int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); const int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; - + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.f; for (int iy = 0; iy < roi_bin_grid_h; ++iy) { const T y = roi_ymin + ph * bin_size_h + @@ -132,12 +137,13 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio) + int sampling_ratio, bool aligned) : data_type_(data_type), pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale), - sampling_ratio_(sampling_ratio) { + sampling_ratio_(sampling_ratio), + aligned_(aligned) { bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || data_type_ == nvinfer1::DataType::kHALF; PADDLE_ENFORCE_EQ(data_type_is_valid, true, @@ -187,6 +193,7 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { DeserializeValue(&data, &length, &pooled_width_); DeserializeValue(&data, &length, &spatial_scale_); DeserializeValue(&data, &length, &sampling_ratio_); + DeserializeValue(&data, &length, &aligned_); int smem_per_block = -1; int device = -1; cudaGetDevice(&device); @@ -204,7 +211,7 @@ nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const TRT_NOEXCEPT { auto* plugin = new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, - spatial_scale_, sampling_ratio_); + spatial_scale_, sampling_ratio_, aligned_); plugin->setPluginNamespace(namespace_.c_str()); return plugin; } @@ -272,14 +279,15 @@ int RoiAlignPluginDynamic::enqueue_impl( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } else { GPUROIAlignOpt< - T, OutT, true><<>>( + T, OutT, + false><<>>( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } return cudaGetLastError() != cudaSuccess; @@ -313,6 +321,10 @@ const char* RoiAlignPluginDynamic::getPluginType() const TRT_NOEXCEPT { return "roi_align_plugin_dynamic"; } +const char* RoiAlignPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { + return "2"; +} + int RoiAlignPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } int RoiAlignPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } @@ -326,6 +338,7 @@ size_t RoiAlignPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(pooled_width_); serialize_size += SerializedSize(spatial_scale_); 
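// Note: getSerializationSize() must count every field that serialize() writes,
// in the same order the deserializing constructor reads them back; the new
// aligned_ flag is therefore added in all three places in this patch.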
serialize_size += SerializedSize(sampling_ratio_); + serialize_size += SerializedSize(aligned_); return serialize_size; } @@ -335,6 +348,7 @@ void RoiAlignPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, pooled_width_); SerializeValue(&buffer, spatial_scale_); SerializeValue(&buffer, sampling_ratio_); + SerializeValue(&buffer, aligned_); } void RoiAlignPluginDynamic::destroy() TRT_NOEXCEPT {} @@ -357,7 +371,7 @@ const char* RoiAlignPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { const char* RoiAlignPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return "1"; + return "2"; } const nvinfer1::PluginFieldCollection* diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h index 44d2b63069835..9f4723da9e17b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -31,7 +31,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio); + int sampling_ratio, bool aligned); RoiAlignPluginDynamic(void const* data, size_t length); ~RoiAlignPluginDynamic() = default; nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; @@ -66,6 +66,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const TRT_NOEXCEPT override; void serialize(void* buffer) const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; private: template @@ -80,6 +81,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { float spatial_scale_; int sampling_ratio_; int smem_per_block_; + bool aligned_; std::string namespace_; }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index 56efdb91959ce..b2d754337fe02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -176,16 +176,6 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT RoisNum NOT SUPPORT") - def teller2(program_config, predictor_config): - if (program_config.ops[0].attrs['sampling_ratio'] == -1 and - program_config.ops[0].attrs['aligned'] == True): - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_SUPPORT, - "SAMPLING_RATIO EQUAL TO - 1 WHEN ALIGNED IS TRUE IS NOT SUPPORT") - def test(self): self.add_skip_trt_case() self.run_test() From a6cf6cddd323436b0e441aeb6f67a9a5da6c2172 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:32:22 +0800 Subject: [PATCH 123/151] [fleet_executor] fix uninitialized pointer (#38904) --- paddle/fluid/distributed/fleet_executor/carrier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 7762effdb9c87..9a74fa78c0e76 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -101,8 +101,8 @@ class Carrier final { std::mutex running_mutex_; 
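// The {nullptr} default member initializers added below guarantee these raw
// Scope pointers read as null rather than indeterminate if they are touched
// before being assigned.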
std::condition_variable cond_var_; std::vector microbatch_scopes_; - framework::Scope* root_scope_; - framework::Scope* minibatch_scope_; + framework::Scope* root_scope_{nullptr}; + framework::Scope* minibatch_scope_{nullptr}; paddle::platform::Place place_; paddle::platform::DeviceContext* dev_ctx_{nullptr}; int64_t rank_; From 53783e1e3d972a5eccb4936ce0ef9ee4aa292a96 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Thu, 13 Jan 2022 14:48:24 +0800 Subject: [PATCH 124/151] [Dist Pass] AMP pass add dist_update_loss_scaling op (#38902) --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/common.py | 2 +- .../operators/dist_update_loss_scaling.py | 134 ++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 5502cb3191a48..c28b7930124dd 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -24,3 +24,4 @@ from . import dist_transpose from . import dist_default from . import dist_check_finite_and_unscale +from . import dist_update_loss_scaling diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 32496b94b920c..8f1ba33f544fb 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -15,7 +15,7 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_registries = {} -BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale'} +BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} class DistributedOperatorImplContainer: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py new file mode 100644 index 0000000000000..56782bec0856a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import set_dist_op_desc_original_id + + +class DistributedUpdateLossScaling(DistributedOperatorImplContainer): + def __init__(self, name): + super(DistributedUpdateLossScaling, self).__init__() + self._name = name + + +register_distributed_operator_impl_container( + "update_loss_scaling", DistributedUpdateLossScaling("update_loss_scaling")) + + +class DistributedUpdateLossScalingImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedUpdateLossScalingImpl, self).__init__() + self._name = name + self._forward_implemented = False + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_input_compatible should not be called !" + ) + + def is_output_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_output_compatible should not be called !" + ) + + def update_dims_mapping(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's update_dims_mapping should not be called !" + ) + + @staticmethod + def forward(ctx, *args, **kwargs): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's forward should not be called !") + + @staticmethod + def backward(ctx, *args, **kwargs): + + # the backward function only filte the gradient with current rank id + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.get_dst_main_program().global_block() + backward_op = dist_op_context.get_cur_src_op() + rank_id = dist_op_context.get_rank_id() + dist_attr = ctx.get_op_dist_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + assert rank_id in dist_attr.process_mesh.processes + + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'FoundInfinite' in kwargs, "input [{}] is not given".format( + 'FoundInfinite') + assert 'PrevLossScaling' in kwargs, "input [{}] is not given".format( + 'PrevLossScaling') + assert 'InGoodSteps' in kwargs, "input [{}] is not given".format( + 'InGoodSteps') + assert 'InBadSteps' in kwargs, "input [{}] is not given".format( + 'InBadSteps') + + assert 'Out' in kwargs, "output [{}] is not given".format('Out') + assert 'LossScaling' in kwargs, "output [{}] is not given".format( + 'LossScaling') + assert 'OutGoodSteps' in kwargs, "input [{}] is not given".format( + 'OutGoodSteps') + assert 'OutBadSteps' in kwargs, "input [{}] is not given".format( + 'OutBadSteps') + + assert len(kwargs['FoundInfinite']) == 1, \ + "update_loss_scaling input FoundInfinite take 1 variable but got {}".format( + kwargs['FoundInfinite']) + assert len(kwargs['PrevLossScaling']) == 1, \ + "update_loss_scaling input PrevLossScaling take 1 variable but got {}".format( + kwargs['PrevLossScaling']) + assert len(kwargs['InGoodSteps']) == 1, \ + "update_loss_scaling input InGoodSteps take 1 variable but got {}".format( + kwargs['InGoodSteps']) + assert len(kwargs['InBadSteps']) == 1, \ + "update_loss_scaling input InBadSteps take 1 variable but got {}".format( + kwargs['InBadSteps']) + assert len(kwargs['LossScaling']) == 1, \ + "update_loss_scaling output LossScaling take 1 variable but got {}".format( 
+ kwargs['LossScaling']) + assert len(kwargs['OutGoodSteps']) == 1, \ + "update_loss_scaling output OutGoodSteps take 1 variable but got {}".format( + kwargs['OutGoodSteps']) + assert len(kwargs['OutBadSteps']) == 1, \ + "update_loss_scaling output OutBadSteps take 1 variable but got {}".format( + kwargs['OutBadSteps']) + + assert len(kwargs['X']) == len(kwargs['Out']), \ + "update_loss_scaling got [{}] X and [{}] Out, which are supposed to be equal".format( + len(kwargs['X']), len(kwargs['Out'])) + + filter_vars = [] + for varname in kwargs['X']: + if rank_id in ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)).process_mesh.processes: + filter_vars.append(varname) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + dist_op_desc.set_input('X', filter_vars) + dist_op_desc.set_output('Out', filter_vars) + main_block._sync_with_cpp() + + +register_distributed_operator_impl( + "update_loss_scaling", + DistributedUpdateLossScalingImpl("update_loss_scaling")) From 7e0292ead7d8c0632135e5480870e4c6bdf93acd Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 13 Jan 2022 14:51:17 +0800 Subject: [PATCH 125/151] [pten]Remove pten/include dir files (#38878) * move dot_dev api into dot_kernel.h * add infermate header * modify to dotkerel in dot_op.h * mvoe conj dev api into complex_kernel.h * move sign dev api into sign_kernel.h * move scale dev api into kernel.h and remove infermete.h * rm paddle/pten/include/math.h * rm paddle/pten/include/math.h * rm include dir * rm paddle/pten/include/math.h * fix conflict with develop branch * rm devContext in conj_op.h * add the missing complex_kernel header --- .../eager/accumulation/accumulation_node.cc | 1 - .../accumulation/gradient_accumulation.cc | 1 - .../eager_generated/backwards/scale_node.cc | 16 +++--- .../eager_generated/forwards/scale.cc | 1 - paddle/fluid/eager/eager_tensor.h | 1 - paddle/fluid/eager/grad_node_info.h | 1 - .../eager/legacy/infer_var_type_context.h | 1 - paddle/fluid/eager/legacy/prepared_operator.h | 2 - paddle/fluid/eager/legacy/tensor_helper.h | 1 - .../framework/data_device_transform_test.cu | 1 - paddle/fluid/framework/operator.h | 3 +- paddle/fluid/imperative/layer.h | 1 - paddle/fluid/imperative/op_base.h | 1 - paddle/fluid/imperative/prepared_operator.h | 2 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/conj_op.h | 3 +- paddle/fluid/operators/dot_op.h | 1 - .../elementwise/elementwise_add_op.h | 1 - .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_op_function.h | 1 - .../elementwise/elementwise_op_impl.cu.h | 1 - .../elementwise/elementwise_sub_op.h | 1 - paddle/fluid/operators/fill_any_like_op.h | 1 - paddle/fluid/operators/flatten_op.h | 1 - paddle/fluid/operators/matmul_v2_op.h | 1 - paddle/fluid/operators/reduce_ops/reduce_op.h | 2 - paddle/fluid/operators/reshape_op.cc | 1 - paddle/fluid/operators/scale_op.h | 5 +- paddle/fluid/operators/sign_op.h | 1 - paddle/fluid/pybind/eager.cc | 1 - paddle/fluid/pybind/eager_functions.cc | 1 - paddle/fluid/pybind/eager_method.cc | 1 - paddle/fluid/pybind/eager_properties.cc | 1 - paddle/fluid/pybind/eager_utils.cc | 1 - paddle/pten/CMakeLists.txt | 2 +- paddle/pten/all.cc | 17 ------- paddle/pten/all.h | 20 -------- paddle/pten/api/lib/utils.cc | 3 +- paddle/pten/include/core.h | 22 -------- paddle/pten/include/infermeta.h | 21 -------- paddle/pten/include/math.h | 39 --------------- 
paddle/pten/kernels/complex_kernel.h | 3 -- paddle/pten/kernels/cpu/scale_kernel.cc | 34 ++++++++++++- paddle/pten/kernels/flatten_kernel.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 14 +++--- .../kernels/impl/matmul_grad_kernel_impl.h | 3 +- paddle/pten/kernels/impl/scale_kernel_impl.h | 50 ------------------- paddle/pten/kernels/math_kernel.h | 3 +- paddle/pten/kernels/reshape_kernel.h | 2 +- paddle/pten/kernels/scale_kernel.h | 28 ++++++++--- paddle/pten/kernels/sign_kernel.h | 2 +- paddle/pten/tests/api/scale_api.h | 35 +++++++------ .../pten/tests/kernels/test_scale_dev_api.cc | 2 +- python/paddle/utils/code_gen/api_gen.py | 6 ++- 54 files changed, 103 insertions(+), 265 deletions(-) delete mode 100644 paddle/pten/all.cc delete mode 100644 paddle/pten/all.h delete mode 100644 paddle/pten/include/core.h delete mode 100644 paddle/pten/include/infermeta.h delete mode 100644 paddle/pten/include/math.h delete mode 100644 paddle/pten/kernels/impl/scale_kernel_impl.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index ed1146eed0fb0..823c0153d71b0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -18,7 +18,6 @@ #include "paddle/pten/api/all.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 9bc24dd28756a..1f66596a0b578 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -28,7 +28,6 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/pten/api/all.h" #include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/include/core.h" #include "unsupported/Eigen/CXX11/Tensor" #ifdef PADDLE_WITH_XPU #include "xpu/refactor/math.h" diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 02eaa79fc9b28..99f6c7a83538e 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/pten/api/all.h" +#include "paddle/pten/kernels/scale_kernel.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -33,28 +33,28 @@ static void ScaleDeviceDispatch(const pten::DenseTensor& dense_tensor, pten::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::FLOAT32: { - pten::Scale(dev_ctx, dense_tensor /* tensor */, - scale /* scale */, bias /* bias */, - bias_after_scale /* bias_after_scale */, - dense_out /* out tensor */); + pten::ScaleKernel( + dev_ctx, dense_tensor /* tensor */, scale /* scale */, + bias /* bias */, bias_after_scale /* bias_after_scale */, + dense_out /* out tensor */); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* 
tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 7b20ff144a7a7..642302a4119be 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -29,7 +29,6 @@ #include "paddle/fluid/eager/utils.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 80faad9080ffe..c58c0b9e66e7a 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" // pten deps -#include "paddle/pten/all.h" #include "paddle/pten/api/all.h" #include "paddle/pten/api/lib/api_declare.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f15c50ef75190..5cf0b90220148 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -16,7 +16,6 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { /** diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h index 2d5a8d806fee7..9d9cbeb38ccfa 100644 --- a/paddle/fluid/eager/legacy/infer_var_type_context.h +++ b/paddle/fluid/eager/legacy/infer_var_type_context.h @@ -26,7 +26,6 @@ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h index 9ba186b14e3b3..0e00b52e0481a 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.h +++ b/paddle/fluid/eager/legacy/prepared_operator.h @@ -25,8 +25,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/eager/legacy/tensor_helper.h b/paddle/fluid/eager/legacy/tensor_helper.h index f87ab70c93686..ce407f8965aa0 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.h +++ b/paddle/fluid/eager/legacy/tensor_helper.h @@ -17,7 +17,6 @@ #include #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a81e4abd45e56..858688dffd8c1 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8e69f96dfb813..9d75c66beb7d4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -41,7 +41,8 @@ limitations under the License. */ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/arg_map_context.h" -#include "paddle/pten/include/core.h" +#include "paddle/pten/core/kernel_context.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 199d62bff1f20..d27460aeeccef 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -37,7 +37,6 @@ #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { class Variable; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 3ff451f817872..cb76a82353282 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -25,7 +25,6 @@ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 5262b265b1b53..29747e79ef6fa 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -27,8 +27,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 72aa9a195ec7c..c54c811b25b66 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/transform.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/cast_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 71115c2eba796..6df982abb8612 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -19,7 +19,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/complex_kernel.h" namespace paddle { @@ -39,7 +38,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 8817e2f3ca79d..ceb8a28e8aa4c 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -21,7 +21,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/dot_grad_kernel.h" #include "paddle/pten/kernels/dot_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 35807d7c57d47..622a6d7edb783 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 385c7549e07f2..687340b668a13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 37d29ed91b3d4..626046890fb06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/kernels/cpu/elementwise.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36ff1ae254d20..9cc741344e50e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 09818380d8ea7..f035e46d1d082 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 287bbbfa3b343..19f6e7a4ef51f 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index ef42619bfe4ff..8e54ecb922f5a 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index e93bd212868fd..9ab77cdcaec0a 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -27,7 +27,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/matmul_grad_kernel.h" #include "paddle/pten/kernels/matmul_kernel.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e1854d8a13d8b..eb4d4a5c1680e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -26,8 +26,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" #include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a25e53aac5d73..47b8da70adbac 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/reshape_grad_kernel.h" #include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 6011fe9a66b60..a6f4f6e27204e 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -19,7 +19,6 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -70,8 +69,8 @@ class ScaleKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Scale(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, - pt_out.get()); + pten::ScaleKernel(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, + pt_out.get()); } }; diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b8dd44c01b050..8294cd2c5f145 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/sign_kernel.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 9484d506b20fb..102bc9f162b0f 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 659df6b9b44de..aaf86bc41aeff 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a0067f9c64fb1..a8c1da2a8b866 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -31,7 +31,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 71b8bbbb1a283..038a1254d7ef6 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #pragma GCC diagnostic ignored "-Wwrite-strings" namespace paddle { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9849d0d41611b..c1049d240795c 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 6a823ff3672bf..a9b7c7581bc2b 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -29,4 +29,4 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) message(STATUS "All standard pten kernels: ${pten_kernels}") set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) -cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS}) +cc_library(pten DEPS ${PTEN_DEPS}) diff --git a/paddle/pten/all.cc b/paddle/pten/all.cc deleted file mode 100644 index d8d96e1cd461e..0000000000000 --- a/paddle/pten/all.cc +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/all.h" - -namespace pten {} // namespace pten diff --git a/paddle/pten/all.h b/paddle/pten/all.h deleted file mode 100644 index c8be629b10e75..0000000000000 --- a/paddle/pten/all.h +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// developer apis -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/include/math.h" diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index ddb29c8833f3b..6eb1e5a3797c9 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -22,8 +22,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); diff --git a/paddle/pten/include/core.h b/paddle/pten/include/core.h deleted file mode 100644 index 9a042753d1f73..0000000000000 --- a/paddle/pten/include/core.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_context.h" -#include "paddle/pten/core/kernel_factory.h" -#include "paddle/pten/core/tensor_meta.h" diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h deleted file mode 100644 index 5e356dd37c03e..0000000000000 --- a/paddle/pten/include/infermeta.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/infermeta/multiary.h" -#include "paddle/pten/infermeta/nullary.h" -#include "paddle/pten/infermeta/unary.h" diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h deleted file mode 100644 index a4fb7f4d98faf..0000000000000 --- a/paddle/pten/include/math.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/scale_kernel.h" - -namespace pten { - -template -DenseTensor Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale) { - auto out_meta = UnchangedInferMeta(x.meta()); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Scale(dev_ctx, x, scale, bias, bias_after_scale, &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index 9dd3d457e4a26..b6074f117ea14 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -15,9 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/empty_kernel.h" - #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index fe9a0a033bced..0582fb87b4457 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -13,18 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/kernels/scale_kernel.h" -#include "paddle/pten/kernels/impl/scale_kernel_impl.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/bfloat16.h" +namespace pten { + +template +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + // calc + out->mutable_data(); + auto eigen_out = pten::EigenVector::Flatten(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + // TODO(chenweihang): now the eigen function here need the dtype of scale, + // eigen_x, bias should be same, so here need cast for two scalar arg, + // maybe we declare that the type of scale and bias is T? + paddle::operators::EigenScale, T>::Eval( + dev, + eigen_out, + eigen_x, + scale.to(), + static_cast(bias), + bias_after_scale); +} + +} // namespace pten PT_REGISTER_CTX_KERNEL(scale, CPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::bfloat16, diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h index a67e66fac4130..c974fda1ed363 100644 --- a/paddle/pten/kernels/flatten_kernel.h +++ b/paddle/pten/kernels/flatten_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 68574c063e77f..ff7e2a6ed284c 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -44,12 +44,12 @@ struct ScaleFunctor { }; template -void Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { +void ScaleKernel(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { std::vector inputs; std::vector outputs; inputs.emplace_back(&x); @@ -67,7 +67,7 @@ void Scale(const ContextT& dev_ctx, PT_REGISTER_CTX_KERNEL(scale, GPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::float16, diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 802cc019d78c5..b1bae78ddc5fa 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -// #include "paddle/pten/kernels/complex_kernel.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h deleted file mode 100644 index 2e0b158b36b8d..0000000000000 --- a/paddle/pten/kernels/impl/scale_kernel_impl.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { - -template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { - // calc - out->mutable_data(); - auto eigen_out = pten::EigenVector::Flatten(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - auto& dev = *dev_ctx.eigen_device(); - // TODO(chenweihang): now the eigen function here need the dtype of scale, - // eigen_x, bias should be same, so here need cast for two scalar arg, - // maybe we declare that the type of scale and bias is T? 
- paddle::operators::EigenScale, T>::Eval( - dev, - eigen_out, - eigen_x, - scale.to(), - static_cast(bias), - bias_after_scale); -} - -} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index f87d0a31b470b..e01103fc5b847 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -16,7 +16,8 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h index faa51c69ad17c..293f6cd2baf61 100644 --- a/paddle/pten/kernels/reshape_kernel.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index 5908050029c7a..ba16db566b8bb 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -16,15 +16,29 @@ limitations under the License. */ #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" - +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out); +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out); + +template +DenseTensor Scale(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + auto out_meta = UnchangedInferMeta(x.meta()); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ScaleKernel( + dev_ctx, x, scale, bias, bias_after_scale, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h index ba205fc96a15c..304b640d2af69 100644 --- a/paddle/pten/kernels/sign_kernel.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h index d525b305c7409..41143826c45d8 100644 --- a/paddle/pten/tests/api/scale_api.h +++ b/paddle/pten/tests/api/scale_api.h @@ -23,8 +23,7 @@ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -92,42 +91,42 @@ static void ScaleCPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::BFLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } @@ -151,42 +150,42 @@ static void ScaleGPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index ac2922b36f205..fe26f56552b05 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ 
b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/scale_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 35720ae32fe38..e8539b11d1455 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -345,8 +345,10 @@ def source_include(header_file_path): #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/infermeta/multiary.h" +#include "paddle/pten/infermeta/nullary.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/declarations.h" """ From 23aa7b08d18d9b6a3e80d6bc31d71b481719b0bd Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 13 Jan 2022 15:04:01 +0800 Subject: [PATCH 126/151] force close eager_generator.exe (#38896) * force close eager_generator.exe * modify according to zhouwei's comment --- paddle/scripts/paddle_build.bat | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index ca34b12b5d4f8..343ab8ff9f5b7 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -42,7 +42,11 @@ taskkill /f /im nvcc.exe /t 2>NUL taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL +taskkill /f /im eager_generator.exe /t 2>NUL +taskkill /f /im eager_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_generator.exe" call terminate 2>NUL +wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL @@ -509,8 +513,12 @@ taskkill /f /im nvcc.exe /t 2>NUL taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL -wmic process where name="cmake.exe" call terminate 2>NUL +taskkill /f /im eager_generator.exe /t 2>NUL +taskkill /f /im eager_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_generator.exe" call terminate 2>NUL +wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL +wmic process where name="cmake.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL @@ -972,7 +980,11 @@ taskkill /f /im nvcc.exe /t 2>NUL taskkill /f /im cicc.exe /t 2>NUL taskkill /f /im ptxas.exe /t 2>NUL taskkill /f /im op_function_generator.exe /t 2>NUL +taskkill /f /im eager_generator.exe /t 2>NUL +taskkill /f /im eager_op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="eager_generator.exe" call terminate 2>NUL +wmic process where 
name="eager_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL From 7a5af6306bb3f34ea951203e5e36419c0be9ac11 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:31:19 +0800 Subject: [PATCH 127/151] [NPU] fix expand op (#38526) * [NPU] fix expand op * [NPU] optimize codes * [NPU] optimize codes --- paddle/fluid/operators/expand_op_npu.cc | 26 +++++++++++++++---- .../tests/unittests/npu/test_expand_op_npu.py | 21 +++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 8ecdd5e8cb695..e9f31f8ddd698 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -81,14 +81,30 @@ class ExpandNPUKernel : public framework::OpKernel { out_dims[i] *= expand_times[i]; } - out0->Resize(out_dims); - out0->mutable_data(context.device_context().GetPlace()); - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + auto place = context.GetPlace(); auto stream = context.template device_context() .stream(); - runner.Run(stream); + + out0->Resize(out_dims); + out0->mutable_data(place); + + bool is_expand_times_all_one = + (out0->numel() == in0->numel()) ? true : false; + + if (is_expand_times_all_one) { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), + out0->mutable_data(place), + BOOST_GET_CONST(platform::NPUPlace, place), in0->data(), + in0->numel() * sizeof(T), stream); + if (out_dims != in_dims) { + out0->Resize(out_dims); + } + } else { + const auto& runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + runner.Run(stream); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 375003f79e500..89ac9e09aa348 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -132,5 +132,26 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +# ------------------------------------------------ +# Special Cases for NPU +# ------------------------------------------------ + + +class TestExpand_expand_times_all_one(TestExpand): + def setUp(self): + self.set_npu() + self.op_type = "expand" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.randn(3, 1, 7).astype(self.dtype) + out = np.tile(x, [1, 1, 1]) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'expand_times': [1, 1, 1]} + self.outputs = {'Out': out} + + if __name__ == '__main__': unittest.main() From eaccdc71dd04b1f42ceac170c82754dd0a953867 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:34:17 +0800 Subject: [PATCH 128/151] [NPU] fix tril_triu (#38864) [NPU] fix tril_triu --- paddle/fluid/operators/tril_triu_op_npu.cc | 41 ++++++++++++++++--- .../unittests/npu/test_tril_triu_op_npu.py | 16 +++++++- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ab7a9035fb974..02af711567f84 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ 
b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -33,12 +33,41 @@ class TrilTriuNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(stream); + auto op_func_tril = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + auto op_func_triu = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + if (x->type() == framework::proto::VarType::BOOL) { + if (lower) { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_tril, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } else { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_triu, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } + } else { + const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); + runner.Run(dev_ctx.stream()); + } } }; @@ -49,4 +78,6 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( tril_triu, ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, ops::TrilTriuNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index 13adc25a38ca5..8239dd4f3fa89 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid import paddle.tensor as tensor @@ -187,5 +187,19 @@ def test_fluid_api(self): fetch_list=[triu_out]) +# @skip_check_grad_ci(reason="[NPU does not support grad right now.") +class TestNPUTrilTriu_bool(TestNPUTrilTriu): + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.bool + + def initTestCase(self): + self.real_op_type = np.random.choice(['triu', 'tril']) + self.diagonal = None + self.X = np.random.choice([False, True], size=(100)).reshape([10, -1]) + + if __name__ == '__main__': unittest.main() From 7f1234563ff3aab32168a6fbaeb57d73748981c3 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Thu, 13 Jan 2022 17:24:53 +0800 Subject: [PATCH 129/151] [bug fix] fix unfold bug in compile time (#38907) --- paddle/fluid/operators/unfold_op.cc | 35 +++++++++++++---------------- paddle/fluid/operators/unfold_op.h | 10 +-------- 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 3f580884aa515..5a8e7e3efbe82 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -143,22 +143,18 @@ class UnfoldOp : public framework::OperatorWithKernel { "but recieved dilations_height: %d dilations_width: %d.", dilations[0], 
dilations[1])); - bool contain_unknown_dim = framework::contain_unknown_dim(in_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - std::vector out_dims; - out_dims.push_back(in_dims[0]); - - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = - CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], paddings[1], - paddings[3], strides[1]); - // check output height and width + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = + CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], + paddings[2], strides[0]); + int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], + paddings[1], paddings[3], strides[1]); + if (ctx->IsRuntime()) { + // only check output height and width in runtime PADDLE_ENFORCE_GT( output_height, 0, platform::errors::InvalidArgument( @@ -179,11 +175,10 @@ class UnfoldOp : public framework::OperatorWithKernel { in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], strides[0], strides[1], dilations[0], dilations[1], output_height, output_width)); - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - - ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } protected: diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h index f22559f1f38c2..006e4822fead0 100644 --- a/paddle/fluid/operators/unfold_op.h +++ b/paddle/fluid/operators/unfold_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,15 +30,6 @@ inline int CalcOutputSize(int input_size, int filter_size, int dilation, int padding1, int padding2, int stride) { const int dkernel = dilation * (filter_size - 1) + 1; int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - PADDLE_ENFORCE_GT( - output_size, 0UL, - platform::errors::InvalidArgument( - "Due to the settings of padding(%d, %d), filter_size(%d), " - "dilation(%d) and " - "stride(%d), the output size is less than 0, please check " - "again. 
Input_size:%d", - padding1, padding2, filter_size, dilation, stride, input_size)); - return output_size; } From dccdc719ebd863db342c3ef1c8794be2ee391348 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 13 Jan 2022 19:33:45 +0800 Subject: [PATCH 130/151] [Paddle-Inference] add Paddle Trt config: with_interleaved (#38884) * add Paddle Trt config: with_interleaved --- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 2 + .../ir_passes/tensorrt_subgraph_pass.cc | 1 + paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 7 +++ .../inference/api/paddle_analysis_config.h | 2 + paddle/fluid/inference/api/paddle_api.h | 21 +++++++ .../inference/api/paddle_inference_api.h | 16 ----- .../tensorrt/convert/batch_norm_op.cc | 17 ++++-- .../tensorrt/convert/elementwise_op.cc | 14 +++-- .../inference/tensorrt/convert/gather_op.cc | 2 + .../inference/tensorrt/convert/op_converter.h | 58 ++++++++++++------- .../inference/tensorrt/convert/scale_op.cc | 16 +++++ .../inference/tensorrt/convert/slice_op.cc | 30 +++++----- paddle/fluid/inference/tensorrt/engine.h | 5 ++ 15 files changed, 136 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index aff2f60551de9..175bc55dcff17 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -212,6 +212,7 @@ struct Argument { bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, TensorRtShapeRangeInfoPath, std::string); DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index dcbbee97a772c..3abda782ab6cf 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -108,6 +108,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index a21118e23aa5c..ef50df3084f8c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -369,6 +369,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( Get("gpu_device_id"), min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetWithInterleaved(Get("with_interleaved")); trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a1ab69906bfc4..273690719336c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -189,6 +189,7 @@ 
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + CP_MEMBER(trt_with_interleaved_); CP_MEMBER(trt_tuned_dynamic_shape_); CP_MEMBER(trt_allow_build_at_runtime_); CP_MEMBER(collect_shape_range_info_); @@ -864,6 +865,8 @@ std::string AnalysisConfig::Summary() { : "false"}); os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"}); + os.InsertRow({"tensorrt_with_interleaved", + trt_with_interleaved_ ? "true" : "false"}); os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"}); if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 929984f50a7b8..2799fb9e174d3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -605,6 +605,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); argument_.SetMinInputShape(config_.min_input_shape_); argument_.SetMaxInputShape(config_.max_input_shape_); argument_.SetOptimInputShape(config_.optim_input_shape_); @@ -1603,5 +1604,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, #endif return false; } +void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, + bool with_interleaved) { +#ifdef PADDLE_WITH_CUDA + c->trt_with_interleaved_ = with_interleaved; +#endif +} } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 77409f95b042e..f65170daccb62 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -796,6 +796,7 @@ struct PD_INFER_DECL AnalysisConfig { bool trt_use_static_engine_{false}; bool trt_use_calib_mode_{true}; bool trt_use_oss_{false}; + bool trt_with_interleaved_{false}; bool trt_use_dla_{false}; int trt_dla_core_{0}; std::map> min_input_shape_{}; @@ -883,6 +884,7 @@ struct PD_INFER_DECL AnalysisConfig { // So we release the memory when the predictor is set up. mutable bool is_valid_{true}; std::string opt_cache_dir_; + friend class paddle_infer::experimental::InternalUtils; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b137b7ba6f97e..c129efe494b4f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -405,3 +405,24 @@ PD_INFER_DECL std::shared_ptr MakeCipher( const std::string& config_file); } // namespace paddle + +// forward declation +using cudaStream_t = struct CUstream_st*; +using hipStream_t = struct ihipStream_t*; + +namespace paddle_infer { +class Predictor; +using Config = paddle::AnalysisConfig; +namespace experimental { +class PD_INFER_DECL InternalUtils { + public: + // Note: Can only be used under thread_local semantics. 
+ static bool RunWithExternalStream(paddle_infer::Predictor* pred, + cudaStream_t stream); + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + hipStream_t stream); + static void UpdateConfigInterleaved(paddle_infer::Config* c, + bool with_interleaved); +}; +} // namespace experimental +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b2b9f2e407478..65906a57f46cb 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -41,27 +41,11 @@ limitations under the License. */ /// \since 2.0.0-beta /// -// forward declation -using cudaStream_t = struct CUstream_st*; -using hipStream_t = struct ihipStream_t*; - namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; -class Predictor; -namespace experimental { -class PD_INFER_DECL InternalUtils { - public: - // Note: Can only be used under thread_local semantics. - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - cudaStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - hipStream_t stream); -}; -} // namespace experimental - /// /// \class Predictor /// diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 71a2fa68f1749..0e66165191474 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -45,7 +45,7 @@ class BatchNormOpConverter : public OpConverter { auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); const float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - + auto output_name = op_desc.Output("Y").front(); PADDLE_ENFORCE_NOT_NULL( Bias_v, platform::errors::NotFound( @@ -145,6 +145,10 @@ class BatchNormOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("reshape_before_batchnorm_out: " + output_name).c_str()); + expand_layer->setName( + ("BN_Shuffle: (Output: " + output_name + ")").c_str()); } layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X, @@ -152,12 +156,13 @@ class BatchNormOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get(), dynamic_shape_offset); - auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); if (x_dim.nbDims < 3 + dynamic_shape_offset) { + layer->getOutput(0)->setName("batch_norm_out"); + layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str()); nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = x_dim.nbDims; for (int i = 0; i < squeeze_shape.nbDims; i++) { @@ -166,10 +171,12 @@ class BatchNormOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "batchnorm_add_scale", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } - 
RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, - test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 7c5af43816c44..33f732c19a875 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -50,6 +50,7 @@ class ElementwiseWeightOpConverter : public OpConverter { op_desc.Input("Y").front().c_str())); auto* Y_t = Y_v->GetMutable(); float* weight_data = nullptr; + auto output_name = op_desc.Output("Out")[0]; weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); nvinfer1::Dims dims_x = X->getDimensions(); @@ -80,6 +81,10 @@ class ElementwiseWeightOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("elementwise_reshape_out: " + output_name).c_str()); + expand_layer->setName( + ("Elewise: Shuffle: (Output: " + output_name + ")").c_str()); } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( @@ -101,11 +106,12 @@ class ElementwiseWeightOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_, + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, + {output_name}, test_mode); } - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, - test_mode); if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) CHECK(op_desc.HasAttr("X_scale")); diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc index e7b82388b6ab8..a98e7535de1b8 100644 --- a/paddle/fluid/inference/tensorrt/convert/gather_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -56,6 +56,8 @@ class GatherOpConverter : public OpConverter { index_shape.d[0] = -1; reshape_layer->setReshapeDimensions(index_shape); + reshape_layer->setName( + ("Gather: Shuffle: (Output: " + output_name + ")").c_str()); auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, *reshape_layer->getOutput(0), axis); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 57a26aec6ebcb..7e0c8bf1da177 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -144,28 +144,44 @@ class OpConverter { it->SetEngine(engine); (*it)(op, scope, test_mode); - bool has_out_scale = op_desc.HasAttr("out_threshold"); - if (has_out_scale) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); - std::string output_name = ""; - if (op_desc.HasOutput("Output")) { - output_name = op_desc.Output("Output").front(); - } else if (op_desc.HasOutput("Out")) { - output_name = op_desc.Output("Out").front(); - } else if (op_desc.HasOutput("Y")) { - output_name = op_desc.Output("Y").front(); - } else { - PADDLE_THROW( - platform::errors::NotFound("Op %s has out threshold but doesn't " - "have an output named \"Output\", " - "\"Out\" or \"Y\".", - op_desc.Type())); + size_t output_num 
= op_desc.OutputNames().size(); + if (output_num == 1) { // The number of output is 1 + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + std::string output_name = ""; + if (op_desc.HasOutput("Output")) { + output_name = op_desc.Output("Output").front(); + } else if (op_desc.HasOutput("Out")) { + output_name = op_desc.Output("Out").front(); + } else if (op_desc.HasOutput("Y")) { + output_name = op_desc.Output("Y").front(); + } else { + PADDLE_THROW( + platform::errors::NotFound("Op %s has out threshold but doesn't " + "have an output named \"Output\", " + "\"Out\" or \"Y\".", + op_desc.Type())); + } + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } + } else if (output_num > 1) { // The number of outputs greater than 1 + for (size_t i = 0; i < output_num; ++i) { + if (op_desc.HasAttr("out_" + std::to_string(i) + "_threshold")) { + float out_scale = BOOST_GET_CONST( + float, + op_desc.GetAttr("out_" + std::to_string(i) + "_threshold")); + std::string output_name = + op_desc.Output(op_desc.OutputNames()[i]).front(); + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } } - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); - VLOG(1) << "Set out scale = " << out_scale << " for tensor " - << output_name << "."; } } diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index b527f2db53808..8b23a8161f593 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -89,21 +89,34 @@ class ScaleOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); expand_layer->setReshapeDimensions(expand_shape); input = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("before_reshape_out: " + out_name).c_str()); + expand_layer->setName( + ("Scale: before_reshape (Output: " + out_name + ")").c_str()); } if (bias_after_scale) { layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_after_scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale (Output: " + out_name + ")").c_str()); } else { // add bias layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), power_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:bias_out: " + out_name).c_str()); + layer->setName(("Scale: scale_bias (Output: " + out_name + ")").c_str()); // mul scale layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM, power_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale_scale (Output: " + out_name + ")").c_str()); } PADDLE_ENFORCE_EQ(layer != nullptr, true, @@ -119,6 +132,9 @@ class ScaleOpConverter : public OpConverter { TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); 
squeeze_layer->setReshapeDimensions(squeeze_shape); layer = static_cast(squeeze_layer); + layer->getOutput(0)->setName(("after_reshape_out: " + out_name).c_str()); + layer->setName( + ("Scale: Shuffle_reshape (Output: " + out_name + ")").c_str()); } RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 7f270b1f390b7..2c08f0fe2bded 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -30,10 +30,11 @@ class SliceOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + auto output_name = op_desc.Output("Out")[0]; + float out_scale = 1; if (op_desc.HasAttr("out_threshold")) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); engine_->SetTensorDynamicRange(input, out_scale); } @@ -71,12 +72,22 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) if (engine_->use_oss() && engine_->with_ernie()) { std::vector plugin_inputs; - // plugin_inputs.emplace_back(trans_layer->getOutput(0)); - plugin_inputs.emplace_back(input); - + if (engine_->with_interleaved()) { + auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Permutation transpose_embed{2, 1, 0, 3}; + shuffler_slice->setSecondTranspose(transpose_embed); + engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0), + out_scale); + shuffler_slice->setName( + ("SpecialSlice_interleaved: Shuffle: (Output: " + output_name + + ")") + .c_str()); + plugin_inputs.emplace_back(shuffler_slice->getOutput(0)); + } else { + plugin_inputs.emplace_back(input); + } std::string pos_name; if (engine_->Has("ernie_pos_name")) { pos_name = engine_->Get("ernie_pos_name"); @@ -99,11 +110,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); layer = engine_->AddDynamicPlugin(&input, 1, plugin); } -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); @@ -111,8 +117,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePlugin(starts, ends, axes, with_fp16); layer = engine_->AddPlugin(&input, 1, plugin); } - - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 7aaeb739de194..663534feda1a8 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -407,6 +407,9 @@ class TensorRTEngine { void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } void SetDLACore(int dla_core) { dla_core_ = dla_core; } void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } + void SetWithInterleaved(bool with_interleaved) { + with_interleaved_ = with_interleaved; + } void ClearWeights() { for (auto& weight_pair : weight_map) { @@ -480,6 +483,7 @@ class TensorRTEngine { bool use_oss() { return use_oss_; } bool with_ernie() { return with_ernie_; } + bool with_interleaved() { 
return with_interleaved_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } AnalysisConfig::Precision precision() { return precision_; } @@ -612,6 +616,7 @@ class TensorRTEngine { bool use_dla_{false}; int dla_core_{0}; bool with_ernie_{false}; + bool with_interleaved_{false}; nvinfer1::ILogger& logger_; // max data size for the buffers. From 158bf13f1c133c6af77674560e33413be552d51f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 19:52:33 +0800 Subject: [PATCH 131/151] [PTen] Rename kernel register marco (#38861) * rename register marco * fix error changing * fix format error --- cmake/pten_kernel.cmake | 6 +- paddle/pten/core/kernel_registry.h | 820 +++--------------- paddle/pten/kernels/cpu/cast_kernel.cc | 30 +- paddle/pten/kernels/cpu/complex_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_grad_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_kernel.cc | 20 +- paddle/pten/kernels/cpu/full_kernel.cc | 50 +- paddle/pten/kernels/cpu/math_kernel.cc | 108 +-- paddle/pten/kernels/cpu/matmul_grad_kernel.cc | 52 +- paddle/pten/kernels/cpu/matmul_kernel.cc | 16 +- paddle/pten/kernels/cpu/scale_kernel.cc | 24 +- paddle/pten/kernels/cpu/sign_kernel.cc | 3 +- paddle/pten/kernels/empty_kernel.cc | 116 +-- paddle/pten/kernels/flatten_grad_kernel.cc | 60 +- paddle/pten/kernels/flatten_kernel.cc | 120 +-- paddle/pten/kernels/gpu/cast_kernel.cu | 36 +- paddle/pten/kernels/gpu/complex_kernel.cu | 22 +- paddle/pten/kernels/gpu/dot_grad_kernel.cu | 20 +- paddle/pten/kernels/gpu/dot_kernel.cu | 20 +- paddle/pten/kernels/gpu/full_kernel.cu | 48 +- paddle/pten/kernels/gpu/math_kernel.cu | 116 +-- paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 58 +- paddle/pten/kernels/gpu/matmul_kernel.cu | 18 +- paddle/pten/kernels/gpu/scale_kernel.cu | 24 +- paddle/pten/kernels/gpu/sign_kernel.cu | 2 +- 25 files changed, 636 insertions(+), 1193 deletions(-) diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake index f962c1332093a..bc9fefb58f452 100644 --- a/cmake/pten_kernel.cmake +++ b/cmake/pten_kernel.cmake @@ -16,12 +16,12 @@ function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL + # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") # parse the first kernel name - string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index f08ef4acfd9ce..194ab52d25688 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -213,20 +213,20 @@ struct KernelRegistrar { * pointer of the corresponding data type is automatically instantiated * during registration. 
* - * Note: `1TA` means `1 template argument` + * Note: `2TA` means `2 template argument` */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PT_REGISTER_KERNEL must be called in global namespace."); \ - _PT_REGISTER_1TA_KERNEL( \ + _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) #ifndef _WIN32 -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ PT_KERNEL_REGISTRAR_INIT( \ @@ -252,7 +252,7 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ @@ -268,60 +268,76 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) \ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + backend, \ + cpp_dtype, \ __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) #define PT_KERNEL_REGISTRAR_INIT( \ kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ @@ -373,10 +389,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ @@ -393,10 +410,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ backend, \ layout, \ @@ -419,10 +437,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ layout, \ @@ -445,10 +464,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ backend, \ layout, \ @@ -471,10 +491,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ backend, 
\ layout, \ @@ -497,10 +518,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ backend, \ layout, \ @@ -523,10 +545,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ backend, \ layout, \ @@ -549,10 +572,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ backend, \ layout, \ @@ -575,10 +599,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ backend, \ layout, \ @@ -601,10 +626,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ backend, \ layout, \ @@ -627,10 +653,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ backend, \ layout, \ @@ -653,10 +680,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ backend, \ layout, \ @@ -679,10 +707,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ backend, \ layout, \ @@ -705,10 +734,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ 
::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ backend, \ layout, \ @@ -731,10 +761,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ backend, \ layout, \ @@ -743,41 +774,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -/** PT_REGISTER_NO_TEMPLATE_KERNEL - * - * Basic Kernel register marco, used to register a no template argument kernel - * function, pass in the complete function pointe of the kernel, this - * registration macro will not do automatic template instantiation. - * - * Note: developer maybe register 2 kernel with same name, backend and diff - * layout, so the layout also need to be a part of symbol var name. If developer - * register 2 kernel with same name, backend, layout and diff dtype, he should - * use another register marco PT_REGISTER_KERNEL. - * - * TODO(chenweihang): remove this marco later - */ -#define PT_REGISTER_NO_TEMPLATE_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - static const ::pten::KernelRegistrar \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::pten::KernelArgsParseFunctor::Parse, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - PT_KERNEL(kernel_fn), \ - PT_VARIADIC_KERNEL(kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ - return 0; \ - } \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) - /** PT_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function @@ -832,558 +828,6 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -/** PT_REGISTER_CTX_KERNEL - * - * Used for kernel registration with device context and data type as - * template parameter. - */ -#define PT_REGISTER_CTX_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_ctx_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_CTX_KERNEL must be called in global namespace."); \ - _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) - -#ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#else -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#endif - -#define PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION2_, N) \ - (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_REGISTRAR_INIT2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PT_KERNEL_REGISTRAR_INIT2(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT2_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format on - -#define _PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) - /** PT_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index c6736cdd1bcf0..a0006f49a2b38 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -58,20 +58,20 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(cast, - CPU, - ALL_LAYOUT, - pten::CastKernel, - float, - double, - int, - int64_t, - int16_t, - bool, - uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { +PT_REGISTER_KERNEL(cast, + CPU, + ALL_LAYOUT, + pten::CastKernel, + float, + double, + int, + int64_t, + int16_t, + bool, + uint8_t, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 10e7e684db3c1..59a7577153a61 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -21,13 +21,13 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - CPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + CPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index c9d5c35e134c8..ed927f820f0e7 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -20,13 +20,13 @@ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - CPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + CPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 72e9e28907f90..0baf9ba0a8bdd 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -49,13 +49,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - CPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + CPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 1ae8001d79dc7..919471d86ac53 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -18,29 +18,29 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - CPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + CPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - CPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + CPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index be0d52355bce6..83388d0d9a80f 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -118,60 +118,60 @@ using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} -PT_REGISTER_CTX_KERNEL(add, - CPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - CPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - CPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - CPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - CPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - paddle::platform::float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 5a8abb6701b0e..4738e21573194 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -19,29 +19,29 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - CPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - CPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - CPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + CPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + CPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + CPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index edba402ec1d84..f749e9cb27979 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - CPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + CPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 0582fb87b4457..7088bba01aa78 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -51,15 +51,15 @@ void ScaleKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - CPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::bfloat16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + CPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::bfloat16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index a7b62822d6e0f..25fa2bb5fe4ef 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -21,5 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/bfloat16.h" -PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) { -} +PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 2dd55a13e38e5..eb67ed6655f47 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -34,66 +34,66 @@ void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { } // namespace pten -PT_REGISTER_CTX_KERNEL(empty, - CPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + CPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - CPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + CPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(empty, - GPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + GPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - GPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + GPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index d6aea31748d6c..45f3c6558d9c8 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -33,41 +33,41 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten_grad, - CPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + CPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten_grad, - GPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + GPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + double, 
+ uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten_grad, - XPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + XPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index b284d3690830f..9201a8df9d166 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -48,72 +48,72 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten, - CPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + CPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - CPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + CPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten, - GPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + GPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - GPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + GPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten, - XPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + XPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - XPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + XPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 0bbe7a3a132d1..2f91c94ba5f75 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -60,24 +60,24 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_CTX_KERNEL(cast, \ - GPU, \ - ALL_LAYOUT, \ - pten::CastKernel, \ - float, \ - double, \ - int, \ - int64_t, \ - int16_t, \ - bool, \ - uint8_t, \ - paddle::platform::float16, \ - paddle::platform::complex, \ - paddle::platform::complex, \ - ##__VA_ARGS__) { \ - kernel->OutputAt(0).SetDataType( \ - paddle::experimental::DataType::UNDEFINED); \ +#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ + PT_REGISTER_KERNEL(cast, \ + GPU, \ + ALL_LAYOUT, \ + pten::CastKernel, \ + float, \ + double, \ + int, \ + int64_t, \ + int16_t, \ + bool, \ + uint8_t, \ + paddle::platform::float16, \ + paddle::platform::complex, \ + paddle::platform::complex, \ + ##__VA_ARGS__) { \ + kernel->OutputAt(0).SetDataType( \ + paddle::experimental::DataType::UNDEFINED); \ } #if !defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 02f050f5bc838..1c82077793e0a 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -21,14 +21,14 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - GPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + GPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 42af96f7c7265..4b0d7fed4c9fd 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - GPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + GPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 08d8f83c408de..18bab5c15a058 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -52,13 +52,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - GPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + GPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index ae1f8529db3de..2f6346daa888f 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -18,28 +18,28 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - GPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + GPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - GPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + GPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 557080638038d..1fd085ab5fe40 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -110,64 +110,64 @@ using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {} -PT_REGISTER_CTX_KERNEL(add, - GPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - GPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - GPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - GPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - GPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index f20c3f82c9262..993b17f6b8ed0 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -19,32 +19,32 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - GPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - GPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - GPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + GPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + GPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + GPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index debda455818a9..a3ab88913a3b6 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -20,12 +20,12 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - GPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + GPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index ff7e2a6ed284c..4d63701413cd6 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -64,15 +64,15 @@ void ScaleKernel(const ContextT& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - GPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + GPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index e7eb7e46861c8..16356507dc8ea 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -23,5 +23,5 @@ limitations under the License. 
*/ using float16 = paddle::platform::float16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {} From 9ff989aeae54472f766bc6ffef8a13111ca8da51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:26:01 +0800 Subject: [PATCH 132/151] remove interface: DenseTensor::release, test=develop (#38937) --- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/pten/api/lib/utils/tensor_utils.cc | 6 ++---- paddle/pten/core/dense_tensor.h | 6 ------ paddle/pten/tests/core/test_dense_tensor.cc | 7 ------- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a8c1da2a8b866..46b56f27ff98e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -189,7 +189,7 @@ static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, << " is initialized, will be released."; auto dense_tensor = std::dynamic_pointer_cast(grad->impl()); - dense_tensor->release(); + dense_tensor->MoveMemoryHolder(); } Py_INCREF(Py_None); return Py_None; diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 0b6cb8d95cc1a..53d641896e43f 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -306,10 +306,8 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { "The destination Tensor is nullptr when move storage.")); dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); - auto storage = src->release(); - std::shared_ptr holder( - new TensorStorage(std::move(storage))); - dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + auto storage = src->MoveMemoryHolder(); + dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); } diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1802a2461158f..4f25fc296724c 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -172,12 +172,6 @@ class DenseTensor : public TensorBase, /// \return The actual storage size occupied by tensor. size_t capacity() const { return storage_->size(); } - /// \brief Release the storage area for other purposes. Because of the - /// destruction of encapsulation, we do not support two dense tensors directly - /// sharing the same intrusive pointer. - /// \return The rvalue of instrusize pointer releated to the released storage. - intrusive_ptr release() { return std::move(storage_); } - /// \brief Get the mutable data pointer value of type T. /// Memory allocation may occur when calling this interface: /// 1. 
When the storage size is not enough to meet the current shape of the diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index c6db228c2b757..8277c0d8dadb7 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -116,9 +116,6 @@ TEST(dense_tensor, resize) { CHECK_EQ(tensor_0.capacity(), 6u); tensor_0.mutable_data(); CHECK_EQ(tensor_0.capacity(), 6u); - - auto storage = tensor_0.release(); - CHECK_EQ(storage->size(), 6u); } TEST(dense_tensor, shallow_copy) { @@ -133,10 +130,6 @@ TEST(dense_tensor, shallow_copy) { DenseTensor tensor_1(tensor_0); CHECK(tensor_0.meta() == tensor_1.meta()); - - // Copy constructor: Now shares the underlying shared_ptr instead - // of Storage - CHECK(tensor_0.release() != tensor_1.release()); } } // namespace tests From 9e0686ed45f79bbe6a5434bf453509cab0b630ea Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:29:37 +0800 Subject: [PATCH 133/151] fix bug of -DPADDLE_WITH_SSE3 not set when WITH_AVX AND AVX_FOUND even SSE3_FOUND (#38931) --- cmake/configure.cmake | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 32ba2ff3ac627..88e8dde8addbc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -31,10 +31,12 @@ endif(NOT WITH_PROFILER) if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) add_definitions(-DPADDLE_WITH_AVX) -elseif(SSE3_FOUND) - if(NOT WIN32) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +elseif(SSE3_FOUND AND NOT WIN32) + set(SIMD_FLAG ${SSE3_FLAG}) +endif() + +if (SSE3_FOUND) + # TODO: Runtime detection should be used here. add_definitions(-DPADDLE_WITH_SSE3) endif() From 7f8d5bc8f02d10db46cce9a975db584528742ed7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Fri, 14 Jan 2022 11:37:26 +0800 Subject: [PATCH 134/151] [MLU]Add mean and reduce_mean op (#38872) * [MLU]: add mean and reduce mean op * [MLU]add mlu pytest dir in CMakeLists.txt * [MLU]fix tensor data * [MLU]fix TensorToPyArray and license --- paddle/fluid/framework/tensor_util.cc | 40 +++- paddle/fluid/memory/detail/buddy_allocator.cc | 5 +- paddle/fluid/memory/memcpy.cc | 10 + paddle/fluid/operators/mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/operators/mlu/mlu_baseop.h | 15 +- .../reduce_ops/reduce_mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/pybind/tensor_py.h | 28 ++- .../fluid/tests/unittests/CMakeLists.txt | 4 + .../fluid/tests/unittests/mlu/CMakeLists.txt | 9 + .../tests/unittests/mlu/test_mean_op_mlu.py | 83 ++++++++ .../unittests/mlu/test_reduce_mean_op_mlu.py | 185 ++++++++++++++++++ .../tests/unittests/mlu/test_relu_op_mlu.py | 166 ++++++++++++++++ .../paddle/fluid/tests/unittests/op_test.py | 11 +- 13 files changed, 796 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/mean_op_mlu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5fd581220097b..724e3cc1e2ee8 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -396,7 +396,8 @@ 
void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) { + if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || + platform::is_mlu_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -1048,6 +1049,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_mlu_place(tensor.place())) { +#ifdef PADDLE_WITH_MLU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& mlu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::MLUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + mlu_dev_ctx.stream()); + mlu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); #endif } else if (platform::is_npu_place(tensor.place())) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1127,9 +1151,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1148,6 +1174,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); @@ -1192,9 +1221,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1213,6 +1244,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not 
supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 96fcd6254d885..b02fb6642be3f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -231,9 +231,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes); #elif defined(PADDLE_WITH_MLU) - allocate_bytes = - DeviceAllocateSize(&platform::MLUInitAllocSize(), - &platform::MLUReallocSize(), request_bytes); + allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, + &platform::MLUReallocSize, request_bytes); #endif // Allocate a new block diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index e6aed2c90dace..153e19a9f1450 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -508,6 +508,9 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); platform::MLUMemcpyD2HAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); @@ -530,6 +533,9 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); platform::MLUMemcpyH2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); @@ -554,6 +560,10 @@ void Copy(platform::MLUPlace dst_place, "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); platform::MLUMemcpyD2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc new file mode 100644 index 0000000000000..9862c2bd95256 --- /dev/null +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class MeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + const T* in_data = input->data(); + T* out_data = output->mutable_data(context.GetPlace()); + auto numel = input->numel(); + auto rank = input->dims().size(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + std::vector reduce_dims; + reduce_dims.reserve(rank); + for (decltype(rank) i = 0; i < rank; ++i) { + reduce_dims.push_back(i); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), + reinterpret_cast(in_data), 0 /*indices_size*/, + nullptr, nullptr, output_desc.get(), + reinterpret_cast(out_data)); + } +}; + +template +class MeanMLUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto output_grad = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(output_grad->numel(), 1, + platform::errors::InvalidArgument( + "Mean Gradient Input Tensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + output_grad->numel())); + auto input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto in_data = output_grad->data(); + auto numel = input_grad->numel(); + auto rank = input_grad->dims().size(); + auto out_data = input_grad->data(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + // means + Tensor mean_var(output_grad->type()); + mean_var.mutable_data(input_grad->dims(), context.GetPlace()); + MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(mean_var.type())); + auto value = static_cast(1.0 / static_cast(input_grad->numel())); + MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var)); + + // means mul output_grad + MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output_grad->type())); + MLUCnnlTensorDesc out_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), in_desc.get(), + reinterpret_cast(in_data), + mean_var_desc.get(), GetBasePtr(&mean_var), + out_desc.get(), reinterpret_cast(out_data), + ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(mean, ops::MeanMLUKernel, + ops::MeanMLUKernel); +REGISTER_OP_MLU_KERNEL(mean_grad, ops::MeanMLUGradKernel, + ops::MeanMLUGradKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ab398a92c2972..8082c45d14b95 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -45,12 +45,22 @@ enum MLULogicMethod { CNNL_LOGIC_OP_OR = 7, }; +inline const void* GetBasePtr(const Tensor* t) { return t->data(); } + +inline void* GetBasePtr(Tensor* t) { return t->data(); } + template inline cnnlDataType_t ToCnnlDataType(const T& t) { auto type = framework::ToDataType(t); return ToCnnlDataType(type); } +template +inline cnnlDataType_t ToCnnlDataType() { + auto type = framework::ToDataType(std::type_index(typeid(T))); + return ToCnnlDataType(type); +} + template <> inline cnnlDataType_t ToCnnlDataType(const framework::proto::VarType::Type& t) { cnnlDataType_t type = CNNL_DTYPE_FLOAT; @@ -89,11 +99,12 @@ NarrowT CheckedNarrowing(const WideT& wide) { return narrow; } -static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { +inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { return ctx.template device_context().cnnl_handle(); } -static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) { +inline static const MLUDeviceContext& GetDevCtxFromCTX( + const ExecutionContext& ctx) { return ctx.template device_context(); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc new file mode 100644 index 0000000000000..ef7e9940f0590 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" + +namespace paddle { +namespace operators { + +template +class ReduceMeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class ReduceMeanGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto reduce_dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < input_dims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + input_dims.size(); + } + reduce_numel *= input_dims[d]; + } + + Tensor tmp_output_grad(output_grad->type()); + auto tmp_output_dims = input_dims; + for (auto d : reduce_dims) { + tmp_output_dims[d] = 1; + } + tmp_output_grad.ShareDataWith(*output_grad); + tmp_output_grad.Resize(framework::make_ddim(tmp_output_dims)); + + MLUCnnlTensorDesc output_grad_desc(tmp_output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(tmp_output_grad.type())); + MLUCnnlTensorDesc input_grad_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + auto value = static_cast(1.0 / static_cast(reduce_numel)); + MLUCnnl::Fill(context, value, 
input_grad_desc.get(), + GetBasePtr(input_grad)); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), output_grad_desc.get(), + GetBasePtr(&tmp_output_grad), input_grad_desc.get(), + GetBasePtr(input_grad), input_grad_desc.get(), + GetBasePtr(input_grad), ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_mean, ops::ReduceMeanMLUKernel, + ops::ReduceMeanMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_mean_grad, ops::ReduceMeanGradMLUKernel, + ops::ReduceMeanGradMLUKernel); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index b31b7456ebca7..1fe6686919453 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -232,6 +232,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self.place())) { +#ifdef PADDLE_WITH_MLU + const T *a = self.data(); + auto p = BOOST_GET_CONST(platform::MLUPlace, self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self.place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -267,6 +274,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self->place())) { +#ifdef PADDLE_WITH_MLU + auto p = BOOST_GET_CONST(platform::MLUPlace, self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self->place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -543,6 +557,11 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, #ifdef PADDLE_WITH_XPU output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), self.type()); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_MLU + output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place), + self.type()); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -845,8 +864,13 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, size_t copy_bytes = sizeof_dtype * numel; auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); - paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, - tensor_buf_ptr, copy_bytes, nullptr); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); return py_arr; #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b46a10c8c79d8..67697fcfd8398 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -803,6 +803,10 @@ if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() +if (WITH_MLU) + add_subdirectory(mlu) +endif() + add_subdirectory(asp) 
add_subdirectory(ir) diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt new file mode 100644 index 0000000000000..8fcd3f196dc19 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +if (WITH_MLU) + foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + +endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py new file mode 100644 index 0000000000000..36419327db6b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +class TestMean(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.device.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([1, 100]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanFP16(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([3, 200]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py new file mode 100644 index 0000000000000..c0be644c79115 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMeanOp(OpTest): + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanOp5D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp6D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp8D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class Test1DReduce(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class 
Test3DReduce3(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].mean()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py new file mode 100644 index 0000000000000..25c50f67949e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestRelu(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluFp16(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestReluNeg(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.array([0.1, -0.1, -1.0]).astype(self.dtype) + out = np.array([0.1, 0.0, 0.0]).astype(self.dtype) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + 
self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ec59c27558332..01d851469a8d1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -326,6 +326,9 @@ def is_rocm_op_test(): def is_npu_op_test(): return hasattr(cls, "use_npu") and cls.use_npu == True + def is_mlu_op_test(): + return hasattr(cls, "use_mlu") and cls.use_mlu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -348,7 +351,8 @@ def is_npu_op_test(): and not is_xpu_op_test() \ and not is_mkldnn_op_test() \ and not is_rocm_op_test() \ - and not is_npu_op_test(): + and not is_npu_op_test() \ + and not is_mlu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1297,7 +1301,8 @@ def find_actual(target_name, fetch_list): # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. if not paddle.is_compiled_with_xpu( - ) and not paddle.is_compiled_with_npu(): + ) and not paddle.is_compiled_with_npu( + ) and not paddle.is_compiled_with_mlu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) @@ -1547,11 +1552,9 @@ def check_grad_with_place(self, delta=numeric_grad_delta, in_place=in_place) for input_to_check in inputs_to_check ] - analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set, user_defined_grad_outputs) - # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 fp32_analytic_grads = [] From 556d509791b2b0a6c12781f7ecb6bbf811ee3bec Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 14 Jan 2022 11:47:16 +0800 Subject: [PATCH 135/151] refactor impl of elementwise op part2 (#38898) --- .../elementwise/elementwise_op_function.h | 621 +------------- paddle/pten/kernels/cpu/elementwise.h | 144 ++++ paddle/pten/kernels/gpu/elementwise.h | 768 ++++++++++++++++++ 3 files changed, 919 insertions(+), 614 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 626046890fb06..7cd04318d3f49 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -49,12 +49,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" -#define GetDivMod(dividend, divisor, div, mod) \ - do { \ - const auto dividend_copy = dividend; \ - *div = dividend_copy / divisor; \ - *mod = dividend_copy % divisor; \ - } while (0) #define DIVUP(x, y) (((x) + (y)-1) / (y)) @@ -138,613 +132,11 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, axis); } -template -void CommonForwardBroadcastCPU(const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - int *x_dims_array, int *y_dims_array, - int *out_dims_array, int max_dim, - const platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - pten::CommonForwardBroadcastCPU(x, y, z, x_dims_array, y_dims_array, - out_dims_array, max_dim, ctx, func, - is_xsize_larger); -} - -#if defined(__NVCC__) || defined(__HIPCC__) - -template -__global__ void CommonGradBroadcastCUDAKernel( - const int *x_strides_array, const int *y_strides_array, - const int *out_dims_array, const int *y_strides_order, - const int *y_dims_order, const T *x, const T *y, const Tout *out, - const Tout *dout, T *dx, int out_size, int max_dim, int thread_num, - DX_OP dx_op) { - T val(0); - int i = blockIdx.x; - int tid = threadIdx.x; - for (int j = tid; j < thread_num; j += blockDim.x) { - const int X_index = i * thread_num + j; - int out_index = X_index; - int C_index = 0; - int B_index = i * thread_num + j; - int remainder = 0; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); - C_index += remainder * y_strides_order[d]; - } - int x_index = 0; - int y_index = 0; - int C_index_val = C_index; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); - x_index += remainder * x_strides_array[d]; - y_index += remainder * y_strides_array[d]; - } - out_index = C_index; - val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); - } - val = paddle::platform::reduceSum(val, tid, thread_num); - if (threadIdx.x == 0) { - dx[i] = val; - } -} - -template -void CommonGradBroadcastCUDA( - const framework::Tensor &x, const framework::Tensor &y, - const framework::Tensor &out, const framework::Tensor &dout, - framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, - int *y_dims_array, int *out_dims_array, int max_dim, - const platform::CUDADeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto cplace = platform::CPUPlace(); - const T *x_data = x.data(); - const T *y_data = y.data(); - const Tout *out_data = out.data(); - const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); - - std::vector x_one_indexs; - std::vector y_one_indexs; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] != y_dims_array[i]) { - if (x_dims_array[i] == 1) { - x_one_indexs.push_back(i); - } - if (y_dims_array[i] == 1) { - y_one_indexs.push_back(i); - } - } - } - - std::vector x_trans_indexs(max_dim); - std::vector y_trans_indexs(max_dim); - pten::ComputeBroadcastTranspositionArray( - x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); - pten::ComputeBroadcastTranspositionArray( - y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); - - // compute array stride for cuda kernel; - // e.g. x.dims=[2,3,4], x_stride=[12,4,1] - std::vector x_strides_array(max_dim); - std::vector y_strides_array(max_dim); - std::vector out_strides_array(max_dim); - int x_stride = 1; - int y_stride = 1; - int z_stride = 1; - for (int i = max_dim - 1; i >= 0; i--) { - x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; - y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; - out_strides_array[i] = z_stride; - x_stride *= x_dims_array[i]; - y_stride *= y_dims_array[i]; - z_stride *= out_dims_array[i]; - } - - std::vector x_strides_order(max_dim); - std::vector y_strides_order(max_dim); - std::vector x_dims_order(max_dim); - std::vector y_dims_order(max_dim); - for (int i = 0; i < max_dim; ++i) { - x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; - y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; - x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; - y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; - } - std::vector x_broadcast_pos; - std::vector y_broadcast_pos; - - int bytes = max_dim * sizeof(int); - - for (int i = 0; i < max_dim; ++i) { - if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { - x_broadcast_pos.emplace_back(i); - } - if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { - y_broadcast_pos.emplace_back(i); - } - } - - auto stream = ctx.stream(); - bool can_split_x = false; - bool can_split_y = false; - - auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h - << " is_y:" << is_y; - - int split_h; - int split_w; - int kh = h; - int kw = w; - - if (is_y) { - split_h = - std::accumulate(x_dims_array, x_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(x_dims_array + broadcast_pos.size(), - x_dims_array + max_dim, 1, std::multiplies()); - - } else { - split_h = - std::accumulate(y_dims_array, y_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(y_dims_array + broadcast_pos.size(), - y_dims_array + max_dim, 1, std::multiplies()); - } - - if (h > split_h) kh = split_h; - if (w > split_w) kw = split_w; - - if (is_y) { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, 
kw, - is_y); - } - } else { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } - } - }; - - auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, - bool x_large) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; - - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::ElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } - }; - - auto FastBroadCastAllCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x_large) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = 1; - int post = 1; - - if (broadcast_pos.size() == 1) { - mid = out_dims_array[axis]; - post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, - 1, std::multiplies()); - } else { - mid = std::accumulate(out_dims_array + axis, - out_dims_array + broadcast_pos.back() + 1, 1, - std::multiplies()); - post = - std::accumulate(out_dims_array + broadcast_pos.back() + 1, - out_dims_array + max_dim, 1, std::multiplies()); - } - - VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - - pten::FastCommonGradBroadcastAllCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op, - dy_op, dx_data, dy_data); - }; - - auto FastBroadCastOneCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = out_dims_array[axis]; - int post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, 1, - std::multiplies()); - - int k_pre; - int k_mid; - int k_post; - - if (is_x) { - k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1, - std::multiplies()); - k_mid = y_dims_array[axis]; - k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - // we need to calc y offset with blockid, so do x_pre/y_pre to get left - // size. 
- if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, true, dx_op, dx_data); - } else { - k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1, - std::multiplies()); - k_mid = x_dims_array[axis]; - k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, false, dy_op, dy_data); - } - VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - }; - - // do fast elementwise if: 1. only one input need to do broadcast, we can - // fallback - // to old fast path. - // 2. if both x and y need broadcast, then do it one by one. - bool fast_broadcast = false; - if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - if (can_split_y) { - // only y need to do broadcast on h - if (y_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(y_broadcast_pos, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.size() == 1 || - pten::CheckContiguousDims( - y_broadcast_pos)) { // for only one dim and - // contiguous broadcast. - // If cannot split, which means input has 3 parts - FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { - // only x need broadcast - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(x_broadcast_pos, false); - fast_broadcast = true; - } - } else if (x_broadcast_pos.size() == 1 || - pten::CheckContiguousDims(x_broadcast_pos)) { - FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); - fast_broadcast = true; - } - } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - // do x and y broadcast each. - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - bool fast_broadcast_x = false; - bool fast_broadcast_y = false; - if (can_split_y) { - // begin at start. - if (y_broadcast_pos[0] == 0) { - FastCommonCUDAF(y_broadcast_pos, true); - fast_broadcast_y = true; - } - } else if (y_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); - can_split_y = true; - fast_broadcast_y = true; - } - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastCommonCUDAF(x_broadcast_pos, false); - fast_broadcast_x = true; - } - } else if (x_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); - can_split_x = true; - fast_broadcast_x = true; - } - VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y - << " can_split_x:" << can_split_x; - // if both x and y into fast path then return - if (fast_broadcast_x && fast_broadcast_y) { - fast_broadcast = true; - } - if (can_split_y && can_split_x && fast_broadcast) return; - } - - // Should remove memory copy, use reg instead. 
- if (fast_broadcast) { - return; - } - int x_blocks = 0; - int x_threads = 0; - pten::ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, - &x_threads, max_dim); - int y_blocks = 0; - int y_threads = 0; - pten::ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, - &y_threads, max_dim); - - auto x_strides_array_tmp = memory::Alloc(ctx, bytes); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), - bytes, ctx.stream()); - - auto y_strides_array_tmp = memory::Alloc(ctx, bytes); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); - memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), - bytes, ctx.stream()); - - auto out_dims_array_tmp = memory::Alloc(ctx, bytes); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); - memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, - ctx.stream()); - - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); - int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); - if (dx) { - auto x_strides_order_tmp = memory::Alloc(ctx, bytes); - int *x_strides_order_gpu = - reinterpret_cast(x_strides_order_tmp->ptr()); - memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), - bytes, ctx.stream()); - - auto x_dims_order_tmp = memory::Alloc(ctx, bytes); - int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); - memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DX_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - x_strides_order_gpu, x_dims_order_gpu, x_data, y_data, out_data, - dout_data, dx_data, out_size, max_dim, x_threads, dx_op); - } - if (dy) { - auto y_strides_order_tmp = memory::Alloc(ctx, bytes); - int *y_strides_order_gpu = - reinterpret_cast(y_strides_order_tmp->ptr()); - memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), - bytes, ctx.stream()); - - auto y_dims_order_tmp = memory::Alloc(ctx, bytes); - int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); - memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DY_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data, - dout_data, dy_data, out_size, max_dim, y_threads, dy_op); - } -} - -#endif // __NVCC__ or __HIPCC__ - inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { return pten::funcs::trim_trailing_singular_dims(dims); } -template -void CommonElementwiseBroadcastBackward( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. - if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << framework::make_ddim(x_dims_array) - << " ydim:" << framework::make_ddim(y_dims_array); - - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - CommonGradBroadcastCUDA( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); -#endif - } else { - pten::CommonGradBroadcastCPU( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); - } -} - -template -void ElemwiseGradComputeWithBroadcast( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } -} - -template -void CommonElementwiseBroadcastForward( - const framework::ExecutionContext &ctx, const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, - int axis, const bool is_xsize_larger = true) { - z->mutable_data(ctx.GetPlace()); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - const auto &dev_ctx = ctx.template device_context(); - pten::CommonElementwiseBroadcastForward(dev_ctx, *pt_x.get(), *pt_y.get(), - pt_z.get(), x_dims, y_dims, func, - axis, is_xsize_larger); -} - template void ElemwiseGradCompute(const framework::ExecutionContext &ctx, @@ -755,14 +147,14 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -780,14 +172,15 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, + dy_op); } } diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 97db997a16478..b448586754d60 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -549,4 +549,148 @@ static void ElemwiseGradBroadcast2CPU(const T* x, } } +template +void CommonElementwiseBroadcastBackward(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), 
y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 4dfcd7a2152e0..5abc40c75d17f 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -18,7 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; @@ -28,6 +31,13 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #define BLOCK_X 32 #define BLOCK_Y 32 +#define GetDivMod(dividend, divisor, div, mod) \ + do { \ + const auto dividend_copy = dividend; \ + *div = dividend_copy / divisor; \ + *mod = dividend_copy % divisor; \ + } while (0) + namespace pten { namespace kps = paddle::operators::kernel_primitives; @@ -1469,4 +1479,762 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); } +template +__global__ void CommonGradBroadcastCUDAKernel(const int *x_strides_array, + const int *y_strides_array, + const int *out_dims_array, + const int *y_strides_order, + const int *y_dims_order, + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + T *dx, + int out_size, + int max_dim, + int thread_num, + DX_OP dx_op) { + T val(0); + int i = blockIdx.x; + int tid = threadIdx.x; + for (int j = tid; j < thread_num; j += blockDim.x) { + const int X_index = i * thread_num + j; + int out_index = X_index; + int C_index = 0; + int B_index = i * thread_num + j; + int remainder = 0; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); + C_index += remainder * y_strides_order[d]; + } + int x_index = 0; + int y_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); + x_index += remainder * x_strides_array[d]; + y_index += remainder * y_strides_array[d]; + } + out_index = C_index; + val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); + } + val = paddle::platform::reduceSum(val, tid, thread_num); + if (threadIdx.x == 0) { + dx[i] = val; + } +} + +template +void CommonGradBroadcastCUDA(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const GPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + const auto gplace = + BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace()); + auto cplace = paddle::platform::CPUPlace(); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); + T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); + + std::vector x_one_indexs; + std::vector y_one_indexs; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] != y_dims_array[i]) { + if (x_dims_array[i] == 1) { + x_one_indexs.push_back(i); + } + if (y_dims_array[i] == 1) { + y_one_indexs.push_back(i); + } + } + } + + std::vector x_trans_indexs(max_dim); + std::vector y_trans_indexs(max_dim); + ComputeBroadcastTranspositionArray( + x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); + ComputeBroadcastTranspositionArray( + y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); + + // compute array stride for cuda kernel; + // e.g. x.dims=[2,3,4], x_stride=[12,4,1] + std::vector x_strides_array(max_dim); + std::vector y_strides_array(max_dim); + std::vector out_strides_array(max_dim); + int x_stride = 1; + int y_stride = 1; + int z_stride = 1; + for (int i = max_dim - 1; i >= 0; i--) { + x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; + y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; + out_strides_array[i] = z_stride; + x_stride *= x_dims_array[i]; + y_stride *= y_dims_array[i]; + z_stride *= out_dims_array[i]; + } + + std::vector x_strides_order(max_dim); + std::vector y_strides_order(max_dim); + std::vector x_dims_order(max_dim); + std::vector y_dims_order(max_dim); + for (int i = 0; i < max_dim; ++i) { + x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; + y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; + x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; + y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; + } + std::vector x_broadcast_pos; + std::vector y_broadcast_pos; + + int bytes = max_dim * sizeof(int); + + for (int i = 0; i < max_dim; ++i) { + if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { + x_broadcast_pos.emplace_back(i); + } + if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { + y_broadcast_pos.emplace_back(i); + } + } + + auto stream = ctx.stream(); + bool can_split_x = false; + bool can_split_y = false; + + auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h + << " is_y:" << is_y; + + int split_h; + int split_w; + int kh = h; + int kw = w; + + if (is_y) { + split_h = std::accumulate(x_dims_array, + x_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(x_dims_array + broadcast_pos.size(), + x_dims_array + max_dim, + 1, + std::multiplies()); + + } else { + split_h = std::accumulate(y_dims_array, + y_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(y_dims_array + broadcast_pos.size(), + y_dims_array + max_dim, + 1, + std::multiplies()); + } + + if (h > split_h) kh = split_h; + if (w > split_w) kw = split_w; + + if (is_y) { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, + dy_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, 
+ dy_data, + kh, + kw, + is_y); + } + } else { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } + } + }; + + auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, + bool x_large) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; + + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + ElemwiseGradBroadcast1CUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } + }; + + auto FastBroadCastAllCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x_large) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = 1; + int post = 1; + + if (broadcast_pos.size() == 1) { + mid = out_dims_array[axis]; + post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } else { + mid = std::accumulate(out_dims_array + axis, + out_dims_array + broadcast_pos.back() + 1, + 1, + std::multiplies()); + post = std::accumulate(out_dims_array + broadcast_pos.back() + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } + + VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + + FastCommonGradBroadcastAllCUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + is_x_large, + dx_op, + dy_op, + dx_data, + dy_data); + }; + + auto FastBroadCastOneCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = out_dims_array[axis]; + int post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + + int k_pre; + int k_mid; + int k_post; + + if (is_x) { + k_pre = std::accumulate( + y_dims_array, y_dims_array + axis, 1, std::multiplies()); + k_mid = y_dims_array[axis]; + k_post = std::accumulate(y_dims_array + axis + 1, + y_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + // we need to calc y offset with blockid, so do x_pre/y_pre to get left + // size. 
+ if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + true, + dx_op, + dx_data); + } else { + k_pre = std::accumulate( + x_dims_array, x_dims_array + axis, 1, std::multiplies()); + k_mid = x_dims_array[axis]; + k_post = std::accumulate(x_dims_array + axis + 1, + x_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + false, + dy_op, + dy_data); + } + VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + }; + + // do fast elementwise if: 1. only one input need to do broadcast, we can + // fallback + // to old fast path. + // 2. if both x and y need broadcast, then do it one by one. + bool fast_broadcast = false; + if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + can_split_y = SplitDims(y_broadcast_pos, max_dim); + if (can_split_y) { + // only y need to do broadcast on h + if (y_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(y_broadcast_pos, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.size() == 1 || + CheckContiguousDims(y_broadcast_pos)) { // for only one dim and + // contiguous broadcast. + // If cannot split, which means input has 3 parts + FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { + // only x need broadcast + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(x_broadcast_pos, false); + fast_broadcast = true; + } + } else if (x_broadcast_pos.size() == 1 || + CheckContiguousDims(x_broadcast_pos)) { + FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); + fast_broadcast = true; + } + } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + // do x and y broadcast each. + can_split_y = SplitDims(y_broadcast_pos, max_dim); + bool fast_broadcast_x = false; + bool fast_broadcast_y = false; + if (can_split_y) { + // begin at start. + if (y_broadcast_pos[0] == 0) { + FastCommonCUDAF(y_broadcast_pos, true); + fast_broadcast_y = true; + } + } else if (y_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); + can_split_y = true; + fast_broadcast_y = true; + } + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastCommonCUDAF(x_broadcast_pos, false); + fast_broadcast_x = true; + } + } else if (x_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); + can_split_x = true; + fast_broadcast_x = true; + } + VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y + << " can_split_x:" << can_split_x; + // if both x and y into fast path then return + if (fast_broadcast_x && fast_broadcast_y) { + fast_broadcast = true; + } + if (can_split_y && can_split_x && fast_broadcast) return; + } + + // Should remove memory copy, use reg instead. 
+ if (fast_broadcast) { + return; + } + int x_blocks = 0; + int x_threads = 0; + ComputeBroadcastKernelSize( + x_dims_array, out_dims_array, &x_blocks, &x_threads, max_dim); + int y_blocks = 0; + int y_threads = 0; + ComputeBroadcastKernelSize( + y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); + + auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_array_gpu, + cplace, + x_strides_array.data(), + bytes, + ctx.stream()); + + auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_array_gpu = + reinterpret_cast(y_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_array_gpu, + cplace, + y_strides_array.data(), + bytes, + ctx.stream()); + + auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); + paddle::memory::Copy( + gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); + int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); + if (dx) { + auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_order_gpu = + reinterpret_cast(x_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_order_gpu, + cplace, + x_strides_order.data(), + bytes, + ctx.stream()); + + auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_dims_order_gpu, + cplace, + x_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DX_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + x_strides_order_gpu, + x_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dx_data, + out_size, + max_dim, + x_threads, + dx_op); + } + if (dy) { + auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_order_gpu = + reinterpret_cast(y_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_order_gpu, + cplace, + y_strides_order.data(), + bytes, + ctx.stream()); + + auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_dims_order_gpu, + cplace, + y_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DY_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + y_strides_order_gpu, + y_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dy_data, + out_size, + max_dim, + y_threads, + dy_op); + } +} + +template +void CommonElementwiseBroadcastBackward(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCUDA(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten From 4c77a9086c488a9a0b11d4e7f0c406c31716345e Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:38:49 +0800 Subject: [PATCH 136/151] Add dygraph sharding stage3 (#38052) --- paddle/pten/core/dense_tensor.cc | 4 + .../meta_parallel/sharding/sharding_stage3.py | 675 ++++++++++++++++++ .../meta_parallel/sharding/sharding_utils.py | 31 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_sharding_stage3.py | 233 ++++++ .../unittests/test_dygraph_sharding_stage3.py | 31 + 6 files changed, 960 insertions(+), 17 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 0b5f5cb18e13d..eb6f834d72779 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -435,6 +435,10 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, } void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } if (storage_ != nullptr && tensor.storage_ != nullptr) { storage_->set_data_shared(tensor.storage_->data_shared()); } diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py new file mode 100644 index 0000000000000..e5d04aac1551e --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import time +import contextlib +import logging +import functools +import numpy as np +from itertools import chain +from functools import reduce +from types import MethodType +from collections import deque, OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import PyLayer +import paddle.fluid.core as core +import paddle.distributed as dist +from paddle.fluid.framework import ParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group + +from .sharding_utils import Type, ShardingClipGrad +from ..pp_utils.utils import _all_gather + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class ShardingStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. 
ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + pertrain_sync_models=True, + accumulate_grads=False, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." + self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._accumulate_grads = accumulate_grads + self._offload = offload + self._sync_comm = sync_comm + + # Communication group establishment + self._group = dist.new_group(_get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = 0 # picking rank 0 as the reference + self._global_ranks = self._group.ranks + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {layer.name: [trainable_params]} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + paddle.get_device(), + self._group) + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = [] + # Register task flow + self._task_flow = TaskFlow() + # Register forward hooks + self._register_forward_hooks(self._layer) + # Register backward parameter hooks + self._register_backward_hooks() + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + dist.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=p, group=self._group, use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + # param.bw_storage.zero_() + param.fw_storage.clear_gradient(False) + param.fw_storage._gradient_set_empty(False) + param.bw_storage._clear() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = 
slice_params + self._optim._param_groups = slice_params + else: + params_name_list = list(map(lambda p: p.name, update_list)) + for param_group in self._optim._param_groups: + slice_p = [] + for p in param_group['params']: + if p.name in params_name_list: + assert hasattr( + p, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format( + p.name) + slice_p.append(p.fw_storage) + param_group['params'] = slice_p + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. + """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def _segment_rank_params(self, layer, name="last_layer"): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = list( + map(_add_manage_info, trainable_params)) + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + offset = 0 + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.VarBase(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + tmp_var = core.VarBase( + tensor=buffer._slice(0, param._numel()), place=core.CPUPlace()) + param_cpu = param.cpu() + tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), + core.CPUPlace()) + param.value().get_tensor()._set_dims(param_shape) + param._clear() + + # Current rank param_storage + param.fw_storage = core.VarBase( + buffer._slice(start, end), "slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.fw_storage.name] = paddle.cast( + param.fw_storage, Type.fp32.value) + + def 
_register_forward_hooks(self, layer): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, self._param2buffer, + self._rank, self._group, self._sync_comm, + task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.no_grad() + def _sync_buffers(self): + for buffer in self._layer.buffers(include_sublayers=True): + dist.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + # Multi stream operation will be supported later + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + if self._accumulate_grads: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + return update_list + + def get_all_parameters(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + if param.use_count > 0: + continue + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + full_param = _all_gather( + param.fw_storage, self._group, use_calc_stream=True) + dist.wait( + tensor=full_param, group=self._group, use_calc_stream=True) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.no_grad() + def reduce(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + with 
paddle.amp.auto_cast(enable=False): + if not self._accumulate_grads: + full_grad.scale_(scale=self._world_size_scaling) + # Only support sync allreduce current rank's layer now + dist.all_reduce( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + dist.wait( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + + start, end = self._param2buffer[param.name][self._rank] + if not self._accumulate_grads or param.bw_storage is None: + param.bw_storage = core.VarBase( + full_grad._slice(start, end)).detach().clone() + else: + param.bw_storage.add_( + core.VarBase(full_grad._slice(start, end)).detach() + .clone()) + param.clear_gradient(False) + param._gradient_set_empty(False) + tmp_var = self._task_flow.full_grad.pop(param.name) + tmp_var._clear() + + if param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear() + start, end = self._param2buffer[param.name][self._rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + self._task_flow.full_param[param.name]._slice(start, + end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = self._task_flow.full_param.pop(param.name) + tmp_var._clear() + + return reduce + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + update_scaler = self._optim.update_scaler + + def _opt_step(self): + if not update_scaler: + params_slice_func() + opt_step() + + self._optim.step = MethodType(_opt_step, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank, + group, sync_comm, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + task_flow.use_calc[layer_id] = use_calc + else: + task_flow.use_calc[layer_id] = use_calc + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + return + + +class ForwardPostHooks(PyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + task_flow): + _release_param(layer, trainable_params, param2buffer, rank, task_flow) + + layer_id = id(layer) + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer = layer + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer = ctx.layer + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + layer_id = id(layer) + use_calc, sync_wait = False, False + if sync_comm: + use_calc, 
sync_wait = True, True + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + else: + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + _create_params_grad(layer, trainable_params, param2buffer_size, + task_flow) + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + layer_next_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(layer, trainable_params, param2buffer, rank, task_flow): + for param in trainable_params[id(layer)]: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + task_flow.full_param[param.name]._slice(start, end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = task_flow.full_param.pop(param.name) + tmp_var._clear() + return + + +def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param = task_flow.full_param[param.name] + with paddle.amp.auto_cast(enable=False): + paddle.device.cuda.synchronize() + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=True) + break + return task_flow + + +def _allgather_buffer(layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=False): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + with paddle.amp.auto_cast(enable=False): + full_param = _all_gather( + param.fw_storage, group, use_calc_stream=use_calc_stream) + if sync_wait: + with paddle.amp.auto_cast(enable=False): + dist.wait( + tensor=full_param, + group=group, + use_calc_stream=use_calc_stream) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = full_param + return task_flow + + +@paddle.no_grad() +def _create_params_grad(layer, trainable_params, param2buffer_size, task_flow): + for param in trainable_params[id(layer)]: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + param._copy_gradient_from( + core.VarBase(temp_grad._slice(0, param._numel()))) + task_flow.full_grad[param.name] 
= temp_grad + return task_flow + + +def _PartitionParam(param): + if not hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = ParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 272aada576be8..5f696195c1abc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -152,6 +152,9 @@ def unscale_method(self, optimizer): param_grads = [] param_grads_fp16 = [] param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True if getattr(optimizer._optim, '_param_groups', None) and isinstance( optimizer._optim._param_groups[0], dict): @@ -161,27 +164,21 @@ def unscale_method(self, optimizer): if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) if param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP16: + ).dtype in [core.VarDesc.VarType.FP16, paddle.float16]: param_grads_fp16.append(param._grad_ivar()) else: param_grads_fp32.append(param._grad_ivar()) else: - param_grads = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if param._grad_ivar() is not None - ] - param_grads_fp16 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 - ) - ] - param_grads_fp32 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 - ) - ] + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 67697fcfd8398..c0c13866ccd55 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -34,6 +34,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS 
test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -250,6 +251,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1058,6 +1060,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..5b0bec9c454b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -0,0 +1,233 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
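To make the parameter-partitioning arithmetic in ShardingStage3._flatten_layer_params (earlier in this patch) easier to follow, here is a minimal, self-contained Python sketch of the same bookkeeping. It is illustrative only and not part of the patch; the helper name partition_param and the example sizes are assumptions, while the 256-byte alignment and the 2/4-byte fp16/fp32 element sizes mirror the constants defined above.

def partition_param(numel, dtype_bytes, nranks, alignment=256):
    """Pad a flattened parameter so it splits evenly across ranks.

    Returns the padded buffer length (in elements) and the [start, end)
    slice owned by each rank, mirroring _flatten_layer_params/_param2buffer.
    """
    size_bytes = numel * dtype_bytes
    remaining = size_bytes % alignment
    # Padding (in elements) needed to reach the next alignment boundary.
    pad_elems = 0 if remaining == 0 else (alignment - remaining) // dtype_bytes
    offset = numel + pad_elems
    # Round the aligned length up to a multiple of the group size.
    if offset % nranks != 0:
        offset += nranks - (offset % nranks)
    chunk = offset // nranks
    slices = [(rank * chunk, (rank + 1) * chunk) for rank in range(nranks)]
    return offset, slices

# Example: a float32 parameter with 1000 elements sharded over 2 ranks.
buffer_len, slices = partition_param(numel=1000, dtype_bytes=4, nranks=2)
# buffer_len == 1024, slices == [(0, 512), (512, 1024)]

Padding the flattened parameter to a multiple of the group size lets every rank own an equally sized, contiguous slice, which is what keeps the later _slice(start, end) and all-gather calls in the stage-3 code straightforward.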
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +epoch = 10 +batch_size = 32 +paddle.seed(2021) +np.random.seed(2021) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +fleet.init(is_collective=True) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + recompute=False): + group = paddle.distributed.new_group([0, 1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler) + if sharding_stage == 2: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), optim=optimizer, group=group) + model = ShardingStage2( + model, + optimizer, + group=group, + buffer_max_size=2**21, + accumulate_grads=accumulate_grad) + elif sharding_stage == 3: + model = ShardingStage3( + model, optimizer=optimizer, group=group, sync_comm=recompute) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if not accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + 
scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + return model.parameters() + + +def test_stage2_stage3(): + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=True) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp32 accumulate grad + stage2_params = train_mlp( + mlp3, + sharding_stage=2, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 recompute + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + recompute=True) + for i in range(len(stage3_params)): + for j in range(len(stage3_params_re)): + if stage3_params[i].name == stage3_params_re[j].name: + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_re[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..89d5f2e8c7b29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_optimizer_stage3(self): + self.run_mnist_2gpu('dygraph_sharding_stage3.py') + + +if __name__ == "__main__": + unittest.main() From 0de8a805a89eb70203163a34858ff504afff30df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:05:00 +0800 Subject: [PATCH 137/151] [infrt] update the version of llvm. test=develop (#38843) --- cmake/external/llvm.cmake | 13 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/common/global.h | 2 +- paddle/infrt/dialect/CMakeLists.txt | 6 +- paddle/infrt/dialect/basic_kernels.cc | 22 +-- paddle/infrt/dialect/basic_kernels.h | 5 +- paddle/infrt/dialect/basic_kernels.td | 7 +- paddle/infrt/dialect/dense_tensor.cc | 148 +++++------------- paddle/infrt/dialect/dense_tensor.h | 51 ++++-- paddle/infrt/dialect/diagnostic_utils.cc | 7 +- paddle/infrt/dialect/diagnostic_utils.h | 6 +- paddle/infrt/dialect/dialect.cc | 16 +- paddle/infrt/dialect/infrt_base.cc | 6 +- paddle/infrt/dialect/infrt_base.h | 32 ++-- paddle/infrt/dialect/infrt_base.td | 6 +- paddle/infrt/dialect/init_infrt_dialects.cc | 12 +- paddle/infrt/dialect/init_infrt_dialects.h | 8 +- paddle/infrt/dialect/mlir_loader.cc | 18 ++- paddle/infrt/dialect/mlir_loader.h | 9 +- paddle/infrt/dialect/mlir_loader_test.cc | 11 +- paddle/infrt/dialect/mlir_tests/rewrite.mlir | 2 +- .../dialect/mlir_tests/rewrite_conv_bn.mlir | 2 +- paddle/infrt/dialect/mlir_tests/trt_ops.mlir | 2 +- paddle/infrt/dialect/ops.td | 6 - paddle/infrt/dialect/opt.cc | 26 +-- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/pd_ops.cc | 29 ++-- paddle/infrt/dialect/pd_ops.h | 36 ++--- paddle/infrt/dialect/pd_ops.td | 14 +- paddle/infrt/dialect/pd_types.h | 11 +- paddle/infrt/dialect/print_ir.cc | 45 +++--- paddle/infrt/dialect/tensor_shape.cc | 16 +- paddle/infrt/dialect/tensor_shape.h | 8 +- paddle/infrt/dialect/tensor_shape_base.td | 4 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 4 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 78 +++++---- .../dialect/tensorrt/trt_graph_fuse_pass.h | 12 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 20 +-- .../dialect/tensorrt/trt_graph_split_pass.h | 10 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 25 ++- .../dialect/tensorrt/trt_op_teller_pass.h | 14 +- paddle/infrt/dialect/tensorrt/trt_ops.cc | 22 ++- paddle/infrt/dialect/tensorrt/trt_ops.h | 41 +++-- paddle/infrt/dialect/test_kernels.cc | 75 ++++----- paddle/infrt/dialect/test_kernels.h | 7 +- paddle/infrt/dialect/types.cc | 17 -- paddle/infrt/dialect/types.h | 16 -- paddle/infrt/host_context/core_runtime.cc | 6 +- paddle/infrt/host_context/core_runtime.h | 6 +- paddle/infrt/host_context/kernel_frame.h | 6 +- .../host_context/kernel_registry_test.cc | 6 +- .../infrt/host_context/kernel_utils_test.cc | 6 +- .../host_context/mlir_function_executable.cc | 1 + .../host_context/mlir_function_executable.h | 3 +- .../host_context/mlir_program_executor.h | 4 +- .../host_context/mlir_to_runtime_translate.cc | 90 ++++++----- .../host_context/mlir_to_runtime_translate.h | 8 +- .../mlir_to_runtime_translate_test.cc | 12 +- 
paddle/infrt/host_context/op_executable.cc | 7 +- paddle/infrt/host_context/op_executable.h | 12 +- paddle/infrt/kernel/basic_kernels.cc | 6 +- paddle/infrt/kernel/basic_kernels.h | 12 +- paddle/infrt/kernel/tensor_kernels.cc | 6 +- paddle/infrt/kernel/tensor_kernels.h | 12 +- paddle/infrt/kernel/tensor_shape_kernels.cc | 6 +- paddle/infrt/kernel/tensor_shape_kernels.h | 12 +- paddle/infrt/kernel/test_kernels.cc | 6 +- paddle/infrt/kernel/test_kernels.h | 12 +- paddle/infrt/paddle/cpp/desc_api.h | 8 +- paddle/infrt/paddle/model_parser.cc | 6 +- paddle/infrt/paddle/model_parser.h | 6 +- paddle/infrt/paddle/pb/block_desc.cc | 8 +- paddle/infrt/paddle/pb/block_desc.h | 8 +- paddle/infrt/paddle/pb/op_desc.cc | 8 +- paddle/infrt/paddle/pb/op_desc.h | 8 +- paddle/infrt/paddle/pb/program_desc.cc | 8 +- paddle/infrt/paddle/pb/program_desc.h | 8 +- paddle/infrt/paddle/pb/var_desc.cc | 8 +- paddle/infrt/paddle/pb/var_desc.h | 8 +- 79 files changed, 616 insertions(+), 637 deletions(-) delete mode 100644 paddle/infrt/dialect/ops.td delete mode 100644 paddle/infrt/dialect/types.cc delete mode 100644 paddle/infrt/dialect/types.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index e080a7359af98..27210e5260048 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) -set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz) +set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) set(FETCHCONTENT_QUIET OFF) @@ -51,7 +51,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") # To build with MLIR, the LLVM is build from source code using the following flags: #[==[ -cmake -G Ninja ../llvm \ +cmake ../llvm -G "Unix Makefiles" \ -DLLVM_ENABLE_PROJECTS="mlir;clang" \ -DLLVM_BUILD_EXAMPLES=OFF \ -DLLVM_TARGETS_TO_BUILD="X86" \ @@ -59,8 +59,10 @@ cmake -G Ninja ../llvm \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_ENABLE_ZLIB=OFF \ -DLLVM_ENABLE_RTTI=ON \ + -DLLVM_INSTALL_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=./install #]==] -# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) +# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit) add_definitions(${LLVM_DEFINITIONS}) @@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS}) # The minimum needed libraries for MLIR IR parse and transform. 
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) +set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) # tb_base is the name of a xxx.td file (without the .td suffix) @@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base) mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) if (mlir_tablegen_on_DIALECT) mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT}) endif() add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 8f05d286bf033..8af3012a220ad 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -77,7 +77,6 @@ add_subdirectory(paddle) # MLIR td file generations set(infrt_mlir_incs - ops_inc basic_kernels_inc test_kernels_inc infrt_base_inc diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index f89164d03f31d..e6586cb3a3c60 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -14,7 +14,7 @@ #pragma once -#include "mlir/IR/MLIRContext.h" +#include #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index d145843684c63..c064b2145266b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - types.cc basic_kernels.cc test_kernels.cc infrt_base.cc @@ -14,8 +13,6 @@ gather_srcs(infrt_src SRCS pd_types.cc pd_ops.cc ) - -mlir_tablegen_on(ops) mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) mlir_tablegen_on(infrt_base DIALECT infrt) @@ -27,8 +24,7 @@ mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) -target_link_libraries(infrtopt infrt ${mlir_libs}) -add_dependencies(infrtopt infrt) +target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index b4d2b9182b0c5..bad7e73ec5ae5 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -17,17 +17,17 @@ #include #include #include -#include -#include +#include +#include #include #include -#include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { using namespace mlir; // NOLINT static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT @@ -71,12 +71,12 @@ static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(32, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 32), parser, result); } static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(64, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 64), parser, result); } static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT @@ -90,10 +90,10 @@ static ParseResult 
parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op.getAttr("callee") << "("; + p << "infrt.call " << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; - p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p.printOptionalAttrDict(op->getAttrs(), {"callee"}); p << " : "; } @@ -145,7 +145,7 @@ static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op.getParentOp()); + auto function = dyn_cast(op->getParentOp()); if (!function) return success(); @@ -157,8 +157,8 @@ static LogicalResult verify(ReturnOp op) { return success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h index 65316bc1437c0..b82abcd52d28f 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/basic_kernels.h @@ -13,12 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -using namespace mlir; // NOLINT - -namespace infrt::dialect { #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index df5e4d8a2c6a1..7d8de79fbae2b 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -27,7 +27,7 @@ def CallOp : INFRT_Op<"call"> { let results = (outs Variadic); let extraClassDeclaration = [{ - StringRef getCallee() { return callee(); } + mlir::StringRef getCallee() { return callee(); } mlir::FunctionType getCalleeType(); }]; } @@ -57,9 +57,8 @@ def ReturnOp : INFRT_Op<"return", [Terminator]> { let arguments = (ins Variadic:$operands); - let builders = [OpBuilder< - "OpBuilder &b, OperationState &result", - [{ build(b, result, llvm::None); }]>]; + let builders = [OpBuilder<(ins), + [{ build($_builder, $_state, llvm::None); }]>]; } class AddOp : INFRT_Op<"add." 
# suffix, [NoSideEffect]> { diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index 629a7b16523fc..7685cdc65b9ad 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -17,12 +17,11 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include @@ -31,68 +30,37 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensor_shape.h" -namespace infrt::dt { - +namespace infrt { +namespace dt { void DTDialect::initialize() { - allowUnknownTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/dense_tensor.cpp.inc" >(); } -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_lower("x86")) + if (key.equals_insensitive("x86")) return TargetType::X86; - else if (key.equals_lower("cuda")) + else if (key.equals_insensitive("cuda")) return TargetType::CUDA; else return llvm::None; } llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_lower("nchw")) + if (key.equals_insensitive("nchw")) return LayoutType::NCHW; - else if (key.equals_lower("nhwc")) + else if (key.equals_insensitive("nhwc")) return LayoutType::NHWC; else return llvm::None; } llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_lower("i32")) + if (key.equals_insensitive("i32")) return PrecisionType::I32; - else if (key.equals_lower("f32")) + else if (key.equals_insensitive("f32")) return PrecisionType::F32; else return llvm::None; @@ -111,7 +79,7 @@ LayoutType TensorType::layout() { return getImpl()->layout_; } PrecisionType TensorType::precision() { return getImpl()->precision_; } -raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() << ", " << tensorType.precision() << ">"; return os; @@ -133,7 +101,7 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -raw_ostream &operator<<(raw_ostream &os, TargetType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { switch (type) { case (TargetType::X86): os << "X86"; @@ -147,7 +115,7 @@ raw_ostream &operator<<(raw_ostream &os, TargetType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, LayoutType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { switch (type) { case (LayoutType::NCHW): os << "NCHW"; @@ -161,7 +129,7 @@ raw_ostream &operator<<(raw_ostream &os, LayoutType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { 
switch (type) { case (PrecisionType::I32): os << "I32"; @@ -175,103 +143,69 @@ raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { return os; } -static Type getTensorType(mlir::MLIRContext *context) { - auto t_dialect = Identifier::get("t", context); - return OpaqueType::get(t_dialect, "tensor", context); +static mlir::Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = mlir::Identifier::get("t", context); + return mlir::OpaqueType::get(t_dialect, "tensor"); } -static ParseResult parseCreateUninitTensorOp( - OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT +static mlir::ParseResult parseCreateUninitTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT auto loc = parser.getCurrentLocation(); - ::mlir::Type outputRawTypes[1]; - ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef outputTypes(outputRawTypes); mlir::ArrayAttr shapeAttr; if (parser.parseAttribute(shapeAttr, parser.getBuilder().getI64Type(), "shape", result.attributes)) - return failure(); - if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + return mlir::failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(outputRawTypes[0])) return failure(); + if (parser.parseArrow()) return mlir::failure(); + if (parser.parseType(outputRawTypes[0])) return mlir::failure(); if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); - return success(); + return mlir::success(); } template -static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT +static void printCreateUninitTensorOp(mlir::OpAsmPrinter &p, // NOLINT CreateUninitTensorOp op) { p << CreateUninitTensorOp::getOperationName(); p << " "; p.printAttributeWithoutType(op.shapeAttr()); - p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"shape"}); p << " -> "; p << op.getOperation()->getResultTypes(); } -// TODO(shibo): can be removed? -// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, -// OperationState& result) { -// auto loc = parser.getCurrentLocation(); -// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; -// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> -// inputOperands(inputRawOperands); -// ::mlir::Type inputRawTypes[1]; -// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); -// -// if (parser.parseOperand(inputRawOperands[0])) return failure(); -// -// if (parser.parseColon()) return failure(); -// if (parser.parseType(inputRawTypes[0])) return failure(); -// if (!inputRawTypes[0].isa()) -// return parser.emitError(loc, "invalid kind of type specified"); -// -// Attribute value_attr; -// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) -// return failure(); -// if (parser.parseAttribute(value_attr, "value", result.attributes)) return -// failure(); -// return success(); -//} - -// TODO(shibo): can be removed? 
-// template -// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { -// p << FillTensorOp::getOperationName(); -// p << " "; -// p.printOperand(op.getOperand()); -// p << " : "; -// p << op.getOperation()->getOperandTypes(); -// p << " "; -// p << op.getAttr("value"); -//} - -static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector operands; - if (parser.parseOperandList(operands, 1)) return failure(); +static mlir::ParseResult parseSetTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + llvm::SmallVector operands; + if (parser.parseOperandList(operands, 1)) return mlir::failure(); auto tensor_type = getTensorType(result.getContext()); - Attribute value_attr; - return failure( + mlir::Attribute value_attr; + return mlir::failure( parser.resolveOperand(operands[0], tensor_type, result.operands) || parser.parseAttribute(value_attr, "values", result.attributes)); } template -static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT +static void printSetTensorOp(mlir::OpAsmPrinter &p, SetTensorOp op) { // NOLINT p << SetTensorOp::getOperationName() << " "; p.printOperand(op.getOperand()); - p << " " << op.getAttr("values"); + p << " " << op->getAttr("values"); } +} // namespace dt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT -} // namespace infrt::dt +#include "paddle/infrt/dialect/dense_tensor_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 866c62213ab05..416925d3382ba 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,13 +19,8 @@ #include -using namespace mlir; // NOLINT -namespace infrt::dt { - -namespace detail { -struct TensorTypeStorage; -} // namespace detail - +namespace infrt { +namespace dt { enum class TargetType : uint8_t { X86, CUDA }; enum class LayoutType : uint8_t { NCHW, NHWC }; enum class PrecisionType : uint8_t { I32, F32 }; @@ -34,9 +29,39 @@ llvm::Optional GetTargetType(mlir::StringRef key); llvm::Optional GetLayoutType(mlir::StringRef key); llvm::Optional GetPrecisionType(mlir::StringRef key); -raw_ostream &operator<<(raw_ostream &os, TargetType type); -raw_ostream &operator<<(raw_ostream &os, LayoutType type); -raw_ostream &operator<<(raw_ostream &os, PrecisionType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail class TensorType : public mlir::Type::TypeBase #include -namespace infrt::dialect { +namespace infrt { 
+namespace dialect { struct MyScopedDiagnosicHandler::Impl { Impl() : diag_stream_(diag_str_) {} @@ -49,4 +51,5 @@ mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { return mlir::failure(true); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h index 3a8098cf75181..746e61c8fe5c3 100644 --- a/paddle/infrt/dialect/diagnostic_utils.h +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -18,7 +18,8 @@ #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { /** * A scoped diagnostic handler to help debug MLIR process. @@ -36,4 +37,5 @@ class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { std::unique_ptr impl_; }; -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc index cbcd5d0f0fa78..fe07b91d22ed5 100644 --- a/paddle/infrt/dialect/dialect.cc +++ b/paddle/infrt/dialect/dialect.cc @@ -13,24 +13,26 @@ // limitations under the License. #include +#include #include -#include #include #include -#include #include #include -namespace infrt::hlir::dialect { +namespace infrt { +namespace hlir { +namespace dialect { -class CinnDialect : public ::mlir::Dialect { +class CinnDialect : public mlir::Dialect { public: - explicit CinnDialect(::mlir::MLIRContext* ctx); + explicit CinnDialect(mlir::MLIRContext* ctx); //! We should register this function in dialect static llvm::StringRef getDialectNamespace() { return "infrt::hlir::dialect"; } }; - -} // namespace infrt::hlir::dialect +} // namespace dialect +} // namespace hlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index b28ad5ad4b5a5..e8005661bbd65 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/test_kernels.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { // ----INFRTDialect definition begin---- void INFRTDialect::initialize() { @@ -124,4 +125,5 @@ void INFRTDialect::printType(mlir::Type type, // ----INFRTDialect definition end---- -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 58acd7c9a409a..1a7fbcf395a6e 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -18,19 +18,17 @@ #include #include #include -#include #include #include #include "paddle/infrt/dialect/infrt_base.hpp.inc" -namespace infrt::dialect { - -class INFRTDialect : public ::mlir::Dialect { - explicit INFRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect(getDialectNamespace(), - context, - ::mlir::TypeID::get()) { +namespace infrt { +namespace dialect { +class INFRTDialect : public mlir::Dialect { + explicit INFRTDialect(mlir::MLIRContext *context) + : mlir::Dialect( + getDialectNamespace(), context, mlir::TypeID::get()) { initialize(); } @@ -41,15 +39,12 @@ class INFRTDialect : public ::mlir::Dialect { mlir::DialectAsmPrinter &printer) const override; void initialize(); - friend class ::mlir::MLIRContext; + friend class mlir::MLIRContext; public: static ::llvm::StringRef getDialectNamespace() { return "infrt"; } }; - -} // namespace infrt::dialect - -namespace mlir { +} // namespace dialect template static 
mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT @@ -58,17 +53,16 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } -static mlir::SmallVector<::mlir::Value, 4> cvtValueToValueRange( +static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { - return mlir::SmallVector<::mlir::Value, 4>(1, operand); + return mlir::SmallVector(1, operand); } -static mlir::SmallVector<::mlir::Value, 4> concatTwoValueRange( +static mlir::SmallVector concatTwoValueRange( mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector<::mlir::Value, 4> operands; + mlir::SmallVector operands; operands.append(operand_0.begin(), operand_0.end()); operands.append(operand_1.begin(), operand_1.end()); return operands; } - -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 7d6fdbbbf2f68..1abd294236d93 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,11 +28,11 @@ def TensorMapType : def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< - "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + "infrt::createI32Attr($_builder, $_loc, " # value # ")">; def INFRT_cvtValueToValueRange : NativeCodeCall< - "mlir::cvtValueToValueRange($0)">; + "infrt::cvtValueToValueRange($0)">; def INFRT_concatTwoValueRange : NativeCodeCall< - "mlir::concatTwoValueRange($0, $1)">; + "infrt::concatTwoValueRange($0, $1)">; #endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 4bc2bf70942d2..c3769414dbb39 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,12 +23,10 @@ #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); +void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT + registry.insert(); } - } // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h index 50caca018980d..0912e9ef2555b 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.h +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -14,10 +14,8 @@ #pragma once -#include "mlir/IR/Dialect.h" - +#include +#include namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT - +void registerCinnDialects(mlir::DialectRegistry ®istry); // NOLINT } // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index b318a6a763483..1d0696e77dcda 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -16,8 +16,8 @@ #include #include +#include #include -#include #include #include #include @@ -30,12 +30,15 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); // Currenetly, We only used the 
CinnDialect and mlir::BuiltinDialect is // enough。Don't need StandardOpsDialect. // context->getDialectRegistry().insert(); @@ -57,9 +60,9 @@ mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); - context->getDialectRegistry().insert(); - + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); mlir::ScopedDiagnosticHandler scope_handler( context, [](mlir::Diagnostic& diag) { if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) @@ -71,4 +74,5 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, return mlir::parseSourceFile(std::string(file_name), context); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 092da7d9ce03f..5e50ad9e5a271 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -15,16 +15,17 @@ #pragma once #include -#include +#include #include #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source); mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context); - -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 1b622d585ad8e..1115053073044 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -17,14 +17,15 @@ #include #include #include -#include +#include #include #include #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { TEST(MlirLoader, basic) { mlir::MLIRContext context; @@ -42,8 +43,7 @@ func @main() -> f32 { )ROC"; auto module = LoadMlirSource(&context, source); - module->verify(); - + EXPECT_TRUE(mlir::succeeded(module->verify())); LOG(INFO) << "module name: " << module->getOperationName().data(); for (auto func : module->getOps()) { LOG(INFO) << "get func " << func.getName().str(); @@ -54,4 +54,5 @@ func @main() -> f32 { } } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir index bfad9d1f6924d..5e207634da8e4 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -20,5 +20,5 @@ func @main() -> tensor { %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - infrt.return %e2 : tensor + "pd.fetch"(%e2) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir index 9ea1ec0ebca36..2889b92b18ef0 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -11,5 +11,5 @@ func @main() -> tensor { %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor %d = "pd.batch_norm"(%c, 
%scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor - infrt.return %d : tensor + "pd.fetch"(%d) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir index 009b6d1c19653..d98f107bab41e 100644 --- a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir +++ b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir @@ -18,5 +18,5 @@ func @main() -> tensor { %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "pd.fetch"(%e2) :(tensor)->() + "pd.fetch"(%e2) {name="output"} :(tensor)->() } diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td deleted file mode 100644 index 264134a447c63..0000000000000 --- a/paddle/infrt/dialect/ops.td +++ /dev/null @@ -1,6 +0,0 @@ -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" - - -class INFRT_Op traits = []> : - Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index d90d25230d0c2..5bcf5a23f4c53 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -12,34 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include - -#include - -#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -#include "paddle/infrt/dialect/mlir_loader.h" int main(int argc, char **argv) { - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); mlir::registerCanonicalizerPass(); - return mlir::failed( - mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); + mlir::MlirOptMain(argc, argv, "infrt mlir pass driver", registry)); } diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index af53df113dfb3..a3e3c4ae59277 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -16,7 +16,7 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; - let cppNamespace = "::mlir::pd"; + let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index ce10be6d100f8..fe38996883846 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -14,10 +14,15 @@ #include "paddle/infrt/dialect/pd_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + namespace mlir { namespace pd { PaddleDialect::PaddleDialect(MLIRContext *context) @@ -36,12 +41,6 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - void ConstantOp::build(OpBuilder &builder, OperationState &state, Attribute value) { @@ -66,8 +65,8 @@ LogicalResult ConstantOp::inferReturnTypes( inferredReturnTypes.push_back(attributes.get("value").getType()); return success(); } -::mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef<::mlir::Attribute> operands) { +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { return value(); } @@ -82,11 +81,11 @@ LogicalResult ElementwiseAdd::inferReturnTypes( return success(); } void ElementwiseAdd::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } -::mlir::OpFoldResult ElementwiseAdd::fold( +mlir::OpFoldResult ElementwiseAdd::fold( llvm::ArrayRef operands) { if (getElementTypeOrSelf(getType()).isa()) { if (!operands[0] || !operands[1]) return {}; @@ -154,17 +153,17 @@ LogicalResult MulOp::inferReturnTypes( } void ReluOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void FusedRepeatedFCRelu::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void BatchNormOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 71e0a53988d1a..7d1d1d6f58451 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -14,21 +14,20 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include namespace mlir { namespace pd { @@ -53,9 +52,8 @@ class PaddleDialect : public Dialect { } }; -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#undef GET_OP_CLASSES - } // namespace pd } // namespace mlir + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td index b020b7ad5dbc7..3addf15082a12 100644 --- a/paddle/infrt/dialect/pd_ops.td +++ b/paddle/infrt/dialect/pd_ops.td @@ -24,6 +24,16 @@ def PD_FeedOp : PD_Op<"feed"> { def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let summary = "fetch Op"; + let description = [{ + Return the output tensor from the subgraph. + }]; + + let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); +} + +def PD_ReturnOp : PD_Op<"return", [Terminator]> { + let summary = "return Op"; + let description = [{ Fetch tensor from the graph. }]; @@ -31,7 +41,7 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins Variadic:$inputs); } -def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ Describe a paddle graph or subgraph. @@ -50,7 +60,7 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + OpBuilder<(ins "Attribute":$value)>, ]; } diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h index 6f9fe56338a9f..0da888a9c0769 100644 --- a/paddle/infrt/dialect/pd_types.h +++ b/paddle/infrt/dialect/pd_types.h @@ -18,12 +18,11 @@ #pragma once -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/IR/Types.h" +#include +#include +#include +#include +#include namespace mlir { namespace PD { diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index 43a3577b90f10..5cfd16ee85943 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -11,26 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "llvm/ADT/Optional.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ScopedPrinter.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Region.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Parser.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/Passes.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" @@ -114,17 +113,15 @@ int main(int argc, char **argv) { mlir::registerPassManagerCLOptions(); cl::ParseCommandLineOptions(argc, argv, "mlir demo"); - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - // context->allowUnregisteredDialects(); - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); + mlir::MLIRContext context(registry); // mlir will verify module automatically after parsing. // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051 // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source, // context); mlir::OwningModuleRef module_ref = - mlir::parseSourceFile(inputFilename, context); + mlir::parseSourceFile(inputFilename, &context); std::cout << "----------print IR Structure begin----------" << std::endl; printOperation(module_ref->getOperation(), 0); std::cout << "----------print IR Structure end----------" << std::endl; diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index ef5a5525cb22f..92c03818264ee 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -17,16 +17,16 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { using namespace mlir; // NOLINT void TensorShapeDialect::initialize() { @@ -48,8 +48,8 @@ Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { return Type(); } -void TensorShapeDialect::printType(::mlir::Type type, - ::mlir::DialectAsmPrinter &os) const { +void TensorShapeDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { if (type.isa()) { os << "shape"; return; @@ -61,8 +61,10 @@ void TensorShapeDialect::printType(::mlir::Type type, } llvm_unreachable("unexpected 'shape' type kind"); } +} // namespace ts +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT -} // namespace infrt::ts +#include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h index bd3fa8853675a..af892af735d2a 100644 --- a/paddle/infrt/dialect/tensor_shape.h +++ b/paddle/infrt/dialect/tensor_shape.h @@ -17,7 +17,8 @@ #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { class ShapeType : public mlir::Type::TypeBase { @@ -31,10 +32,9 @@ class PartialShapeType : public mlir::Type::TypeBase()">, "!ts.shape type">, 
BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.shape type` represents a static tensor shape. }]; } @@ -27,7 +27,7 @@ BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.partial_shape type` represents either a static tensor shape, unranked tensor shape or a ranked tensor shape with unknown dimension sizes. }]; diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dc0f2acb2b733..1baef7a3f77fd 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 181f462962aee..1da80ef2c3b10 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include +#include +#include +#include #include #include #include -#include "llvm/ADT/SetVector.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/IR/Builders.h" -#include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -32,9 +31,9 @@ namespace { // Reference the function nameed "FlexibleDFS" but defined in: // paddle/fluid/framework/ir/subgraph_detector.cc. -bool reverseDfs(std::vector<::mlir::Operation *> source, - const std::function &func) { - std::unordered_set visited; +bool reverseDfs(std::vector source, + const std::function &func) { + std::unordered_set visited; while (!source.empty()) { auto node = source.back(); source.pop_back(); @@ -44,7 +43,7 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, auto values = node->getOperands(); for (auto value : values) { // if the value is a block argument, the node is nullptr. - ::mlir::Operation *node = value.getDefiningOp(); + mlir::Operation *node = value.getDefiningOp(); if (node != nullptr && !visited.count(node)) { source.emplace_back(node); } @@ -54,19 +53,19 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, } // merge the first&second graph op to a new graph op. 
-void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT - ::mlir::pd::GraphOp first, - ::mlir::pd::GraphOp second) { +void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT + mlir::pd::GraphOp first, + mlir::pd::GraphOp second) { // comput inputs and outputs - ::llvm::SmallVector<::mlir::Value, 4> inputs(first.getOperands()), outputs; - for (::mlir::Value input : second.getOperands()) { + ::llvm::SmallVector inputs(first.getOperands()), outputs; + for (mlir::Value input : second.getOperands()) { if (input.getDefiningOp() != first) { inputs.push_back(input); } } - ::llvm::DenseMap<::mlir::Value, unsigned int> op_output_mapping; - for (::mlir::Value output : first.getResults()) { - for (::mlir::Operation *user : output.getUsers()) { + ::llvm::DenseMap op_output_mapping; + for (mlir::Value output : first.getResults()) { + for (mlir::Operation *user : output.getUsers()) { if (user != second && user->getParentOp() != second) { op_output_mapping[output] = outputs.size(); outputs.push_back(output); @@ -74,19 +73,19 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT } } } - auto fetch_op = second.getBody()->getTerminator(); - outputs.append(fetch_op->getOperands().begin(), - fetch_op->getOperands().end()); - ::llvm::SmallVector<::mlir::Type, 4> fetch_types; + auto return_op = second.getBody()->getTerminator(); + outputs.append(return_op->getOperands().begin(), + return_op->getOperands().end()); + ::llvm::SmallVector return_types; for (auto value : outputs) { - fetch_types.push_back(value.getType()); + return_types.push_back(value.getType()); } // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>(loc, fetch_types, inputs); - ::mlir::Block *block = new ::mlir::Block; + auto graph_op = builder.create(loc, return_types, inputs); + mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), second.getBody()->getOperations(), @@ -98,18 +97,18 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create(loc, outputs); graph_op.body().push_back(block); // mapping the output unsigned int num_result = first.getNumResults(); - fetch_op = first.getBody()->getTerminator(); + return_op = first.getBody()->getTerminator(); for (unsigned int index = 0; index < num_result; ++index) { auto origin_value = first.getResult(index); if (op_output_mapping.find(origin_value) == op_output_mapping.end()) { - origin_value.replaceAllUsesWith(fetch_op->getOperand(index)); + origin_value.replaceAllUsesWith(return_op->getOperand(index)); } else { - auto inner_value = fetch_op->getOperand(index); + auto inner_value = return_op->getOperand(index); auto outer_value = graph_op.getResult(op_output_mapping[origin_value]); while (!origin_value.use_empty()) { auto replace_value = @@ -128,13 +127,13 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT // Topological sort the function op. 
void topoSortBlock(mlir::Block &body) { // NOLINT - llvm::SetVector toSort; + llvm::SetVector toSort; if (body.empty()) return; for (auto it = body.rbegin(); it != body.rend(); ++it) { toSort.insert(&*it); } - llvm::SetVector result = - ::mlir::topologicalSort(std::move(toSort)); + llvm::SetVector result = + mlir::topologicalSort(std::move(toSort)); for (auto *op : result) { op->moveBefore(body.getTerminator()); } @@ -145,21 +144,21 @@ void topoSortBlock(mlir::Block &body) { // NOLINT // Implementation of the trtGraphFusePass. void trtGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); bool changed = false; do { changed = false; for (auto &op : body) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - ::mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(user_op); + mlir::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. - std::vector<::mlir::Operation *> source_nodes; + std::vector source_nodes; for (auto operand : user_op->getOperands()) { auto input = operand.getDefiningOp(); if (input != &op && input != nullptr) { @@ -167,9 +166,8 @@ void trtGraphFusePass::runOnFunction() { } } // Reverse DFS from the source_nodes. - if (!reverseDfs(source_nodes, [&op](const ::mlir::Operation *n) { - return n == &op; - })) { + if (!reverseDfs(source_nodes, + [&op](const mlir::Operation *n) { return n == &op; })) { mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); changed = true; break; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index e7134e88f316c..f1e555c6f67ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -28,15 +28,15 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * "pd.fetch" %d, %f * @@ -47,13 +47,13 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } */ class trtGraphFusePass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2b45364de2036..257f2b5285425 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" @@ -22,24 +22,24 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void trtGraphSplitPass::runOnFunction() { - std::vector<::mlir::pd::GraphOp> worklist; - ::mlir::Block& block = getFunction().front(); + std::vector worklist; + mlir::Block& block = getFunction().front(); for (auto& op : block) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - ::mlir::pd::GraphOp graph_op = worklist.back(); + mlir::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); - ::mlir::Block* body = graph_op.getBody(); - auto fetch_op = body->getTerminator(); - graph_op.replaceAllUsesWith(fetch_op->getOperands()); + mlir::Block* body = graph_op.getBody(); + auto return_op = body->getTerminator(); + graph_op.replaceAllUsesWith(return_op->getOperands()); auto copy_range = body->without_terminator(); - block.getOperations().splice(::mlir::Block::iterator(graph_op), + block.getOperations().splice(mlir::Block::iterator(graph_op), body->getOperations(), copy_range.begin(), copy_range.end()); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 092df0cf834e5..d30d186647fc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -31,9 +31,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" (%n, %s) * } ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -42,11 +42,11 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } */ class trtGraphSplitPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 7b7fbb05c1d13..4e8d40b982b2e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -14,49 +14,48 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 void trtOpTellerPass::runOnFunction() { - ::mlir::Block &body = getFunction().front(); - std::vector<::mlir::Operation *> worklist; + mlir::Block &body = getFunction().front(); + std::vector worklist; worklist.reserve(body.getOperations().size()); for (auto &op : body) { worklist.push_back(&op); } // Build GraphOp. - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null<::mlir::pd::FeedOp>(op); + auto op1 = ::llvm::dyn_cast_or_null(op); if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null<::mlir::pd::FetchOp>(op); + auto op2 = ::llvm::dyn_cast_or_null(op); if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(op); + auto op3 = ::llvm::dyn_cast_or_null(op); if (op3) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); - ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values; + ::llvm::SmallVector tblgen_repl_values; for (auto v : - ::llvm::SmallVector<::mlir::Value, 4>{graph_op.getODSResults(0)}) { + ::llvm::SmallVector{graph_op.getODSResults(0)}) { tblgen_repl_values.push_back(v); } op->replaceAllUsesWith(tblgen_repl_values); // Build graph op. - ::mlir::Block *block = new ::mlir::Block; + mlir::Block *block = new mlir::Block; graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b03945b3459c0..fb16c974f7fb3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -29,7 +29,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -37,23 +37,23 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ class trtOpTellerPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 4c02238b10e1d..35b7967892caf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,27 +13,25 @@ // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include namespace infrt { namespace trt { -TensorRTDialect::TensorRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect("trt", context, ::mlir::TypeID::get()) { +TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) + : mlir::Dialect("trt", context, mlir::TypeID::get()) { addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); -#undef GET_OP_LIST } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - } // namespace trt } // namespace infrt + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index c9043c2280de0..a37491ec1abc7 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -14,37 +14,32 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace infrt { namespace trt { -class TensorRTDialect : public ::mlir::Dialect { +class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(::mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext* context); static llvm::StringRef getDialectNamespace() { return "trt"; } }; -// mlir bug。 can be removed safety when update mlir to llvm11. 
-using namespace mlir; // NOLINT +} // namespace trt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensorrt/trt_ops.hpp.inc" -#undef GET_OP_CLASSES - -} // namespace trt -} // namespace infrt diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index 894d96f95ad5c..c4588d7cf8bab 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/test_kernels.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/OpDefinition.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" - -namespace infrt::dialect { +#include +#include +#include +#include +namespace infrt { +namespace dialect { //===----------------------------------------------------------------------===// // BenchmarkOp //===----------------------------------------------------------------------===// @@ -32,65 +31,67 @@ namespace infrt::dialect { // ... // } -static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - StringAttr nameAttr; +static mlir::ParseResult parseBenchmarkOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + mlir::StringAttr nameAttr; if (parser.parseAttribute(nameAttr, "name", result.attributes)) - return failure(); + return mlir::failure(); // Parse the operands, e.g. (%c : i32, %d : f32) - if (parser.parseLParen()) return failure(); + if (parser.parseLParen()) return mlir::failure(); - SmallVector operands; - SmallVector types; + llvm::SmallVector operands; + llvm::SmallVector types; llvm::SMLoc type_loc = parser.getCurrentLocation(); if (parser.parseOptionalRParen()) { // Parse non-empty operands do { // Parse %c : i32, - OpAsmParser::OperandType operand; - Type type; + mlir::OpAsmParser::OperandType operand; + mlir::Type type; if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); + return mlir::failure(); operands.push_back(operand); types.push_back(type); } while (succeeded(parser.parseOptionalComma())); - if (parser.parseRParen()) return failure(); + if (parser.parseRParen()) return mlir::failure(); } if (parser.resolveOperands(operands, types, type_loc, result.operands)) - return failure(); + return mlir::failure(); // Parse the keyword attribute, e.g. 
max_count = 100, duration_secs = 1 do { - StringRef attr; - Attribute resultAttr; + mlir::StringRef attr; + mlir::Attribute resultAttr; if (parser.parseKeyword(&attr) || parser.parseEqual() || parser.parseAttribute(resultAttr, parser.getBuilder().getIntegerType(32), attr, result.attributes)) - return failure(); - } while (succeeded(parser.parseOptionalComma())); + return mlir::failure(); + } while (mlir::succeeded(parser.parseOptionalComma())); // Set the default attribute num_warmup_runs to 1 if unset auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { bool found = llvm::any_of(result.attributes, - [attr_name](const NamedAttribute &attr) { - return attr.first == attr_name; + [attr_name](const mlir::NamedAttribute &attr) { + return attr.getName() == attr_name; }); if (!found) { - IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + mlir::IntegerAttr default_val = + parser.getBuilder().getI32IntegerAttr(value); result.addAttribute(attr_name, default_val); } }; setDefaultAttrIfUnset("num_warmup_runs", 1); - Region *target = result.addRegion(); + mlir::Region *target = result.addRegion(); return parser.parseRegion(*target, operands, types, @@ -102,11 +103,11 @@ static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT // max_count = 100, duration_secs = 1 { // ... // } -static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT +static void print(mlir::OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p << "infrt.benchmark "; // Print the name attribute, e.g "add.i32" - auto name_attr = op.getAttr("name"); + auto name_attr = op->getAttr("name"); p << name_attr; // Print the operands and types, e.g. (%c : i32, %d : f32) @@ -120,13 +121,13 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT bool need_comma = false; // Print the attributes, e.g. max_count = 100, duration_secs = 1 - for (auto &name_attr : op.getAttrs()) { - auto id = name_attr.first; + for (auto &name_attr : op->getAttrs()) { + auto id = name_attr.getName(); if (id == "name") continue; if (need_comma) p << ", "; - auto attr = name_attr.second; + auto attr = name_attr.getValue(); p << id << " = "; - if (auto int_attr = attr.dyn_cast()) { + if (auto int_attr = attr.dyn_cast()) { int_attr.getValue().print(p.getStream(), /*isSigned=*/false); } else { op.emitOpError("Unexpected attribute"); @@ -142,7 +143,7 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p.printRegion(op.region(), /*printEntryBlockArgs=*/false); } -static LogicalResult verify(BenchmarkOp op) { +static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); @@ -154,10 +155,10 @@ static LogicalResult verify(BenchmarkOp op) { "incorrect number of return values. One return value is expected"); } - return success(); + return mlir::success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h index 29d4209cb7280..73c8a6fb387bc 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/test_kernels.h @@ -13,11 +13,8 @@ // limitations under the License. 
#pragma once -#include "mlir/IR/OpDefinition.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include -namespace infrt::dialect { -using namespace mlir; // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc deleted file mode 100644 index 6d6f6a20b46c9..0000000000000 --- a/paddle/infrt/dialect/types.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/types.h" - -namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h deleted file mode 100644 index a9a2b61871cc0..0000000000000 --- a/paddle/infrt/dialect/types.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index cdb8cc99ecb26..e3917bd07d242 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -23,7 +23,8 @@ #include "paddle/infrt/host_context/op_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct CoreRuntime::Impl { KernelRegistry* kernel_registry{}; @@ -90,4 +91,5 @@ llvm::SmallVector CoreRuntime::GetResults( CoreRuntime::~CoreRuntime() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index 802f8b17bb010..acb6a66cac630 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -22,7 +22,8 @@ #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; class OpExecutable; @@ -83,4 +84,5 @@ class CoreRuntimeBuilder : public CoreRuntime { OpExecutableBuilder* NewOpExecutable(const std::string& op_name); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 20cb17dc7fbe2..5186b88fe2c41 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -21,7 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { /** * KernelFrame captures the states(input arguments, attributes, results) @@ -163,4 +164,5 @@ class KernelFrameBuilder : public KernelFrame { } }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc index f36ec2a1cac7d..7fca56343041c 100644 --- a/paddle/infrt/host_context/kernel_registry_test.cc +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } @@ -44,4 +45,5 @@ TEST(KernelRegistry, basic) { ASSERT_EQ(results[0]->get(), 3); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index 1904eb106a293..bebd8d86e50bb 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -16,7 +16,8 @@ #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } float add_f32(float a, float b) { return a + b; } @@ -66,4 +67,5 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 5f8dacf8e448a..47ec27ebec300 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -15,6 +15,7 @@ #include 
"paddle/infrt/host_context/mlir_function_executable.h" #include +#include #include // NOLINT diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index ba5fa154d6fcc..a6428df86e6b2 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -13,7 +13,8 @@ // limitations under the License. #pragma once -#include +#include +#include #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index b2af4d2d79db5..c2ccb90640b21 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -15,9 +15,9 @@ #pragma once #include +#include +#include #include -#include -#include #include #include diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 25324b1291582..3dbc7a702be38 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,8 +16,9 @@ #include #include +#include +#include #include -#include #include #include @@ -40,7 +41,8 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { template std::string DumpToString(T& op) { // NOLINT @@ -113,10 +115,10 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getInt(); } @@ -125,10 +127,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( } template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getInt(); } @@ -139,10 +141,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( // TODO(Superjomn) Make double and float parsing share some thing. 
template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } return boost::none; @@ -150,10 +152,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } return boost::none; @@ -161,17 +163,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - return attr->cast().getValue().str(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + return attr.cast().getValue().str(); } #define PROCESS_ARRAY_INT(type__, bits__) \ template <> \ boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute* attr) { \ - if (!attr->isa()) return boost::none; \ - auto array = attr->cast(); \ + const mlir::Attribute& attr) { \ + if (!attr.isa()) return boost::none; \ + auto array = attr.cast(); \ CHECK(!array.empty()); \ \ if (!array[0].getType().isInteger(bits__)) { \ @@ -191,9 +193,9 @@ PROCESS_ARRAY_INT(int64_t, 64); template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF32()) return boost::none; @@ -207,9 +209,9 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF64()) return boost::none; @@ -236,7 +238,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (int i = 0, e = op->getNumOperands(); i < e; i++) { // function argument as value auto operand = op->getOperand(i); - if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); Value* arg_value = GetValue(arg); impl_->cur_op->AppendArgument(arg_value); @@ -283,25 +286,25 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; - if (auto v = EmitAttribute(&attr.second)) { + if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else 
if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else { LOG(FATAL) << "Not supported attribute type"; @@ -330,7 +333,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { llvm::SmallVector results; auto func_type = - mlir::FunctionType::get(inputs, results, region.getContext()); + mlir::FunctionType::get(region.getContext(), inputs, results); auto* function = impl_->cur_op->CreateFunctionExecutable( ®ion, func_type, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); @@ -555,4 +558,5 @@ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { execute.Run(); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 598e81bfd96d8..fcd79eaf386ee 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -29,7 +29,8 @@ class Attribute; class Value; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class CoreRuntimeBuilder; class Value; @@ -73,7 +74,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute* attr); + boost::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); @@ -104,4 +105,5 @@ void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); */ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 9b85be977ab6c..375daa4515e17 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -29,7 +29,8 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { TEST(MlirToRuntimeTranslate, basic) { mlir::MLIRContext context; @@ -48,7 +49,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + 
EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -74,7 +75,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -115,7 +116,7 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensorverify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); host_context::KernelRegistry registry; @@ -157,4 +158,5 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor #include #include "paddle/infrt/host_context/kernel_frame.h" @@ -21,7 +22,8 @@ #include "paddle/infrt/host_context/mlir_function_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct OpExecutable::Impl { Impl(const std::string& op_name, @@ -148,4 +150,5 @@ void OpExecutable::Execute() { OpExecutable::~OpExecutable() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h index e2248225a5caf..550f6ab6349ed 100644 --- a/paddle/infrt/host_context/op_executable.h +++ b/paddle/infrt/host_context/op_executable.h @@ -14,19 +14,18 @@ #pragma once #include - +#include +#include #include #include #include -#include "mlir/IR/Function.h" -#include "mlir/IR/Region.h" - namespace mlir { class FuncOp; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class SymbolTable; class KernelRegistry; @@ -89,4 +88,5 @@ class OpExecutableBuilder : public OpExecutable { function_defs_t* function_defs); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index d7f2c3865157d..b186cfcfd2b35 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -23,7 +23,8 @@ using infrt::host_context::Attribute; -namespace infrt::kernel { +namespace infrt { +namespace kernel { template T add(T a, T b) { @@ -82,4 +83,5 @@ void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h index 9e98885cf6ebf..feb66be61f530 100644 --- a/paddle/infrt/kernel/basic_kernels.h +++ b/paddle/infrt/kernel/basic_kernels.h @@ -15,13 +15,16 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the basic kernels to \p registry. 
@@ -31,4 +34,5 @@ void RegisterBasicKernels(host_context::KernelRegistry* registry); void RegisterIntBasicKernels(host_context::KernelRegistry* registry); void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 2fa477aa4dbda..51e0004922374 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,7 +25,8 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { using namespace host_context; // NOLINT using namespace tensor; // NOLINT @@ -76,4 +77,5 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShallowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h index 8f2180ba80a4f..df8e25c32393c 100644 --- a/paddle/infrt/kernel/tensor_kernels.h +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -14,12 +14,16 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc index a04b492819298..4edbecfa10886 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.cc +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -24,7 +24,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { void PrintShape(const tensor::TensorShape& shape) { llvm::raw_os_ostream oos(std::cout); @@ -35,4 +36,5 @@ void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h index e87c6c37e88a0..e31a37463be43 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.h +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -14,14 +14,18 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d5f64d09b602f..ccfb3356a855f 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -33,7 +33,8 @@ using infrt::host_context::Attribute; using infrt::host_context::MlirFunctionExecutable; using infrt::host_context::RemainingArguments; -namespace infrt::kernel { +namespace infrt { +namespace kernel { namespace { class BenchmarkStats { public: @@ -197,4 +198,5 @@ void 
RegisterTestKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShadowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h index f42884dfaf2c9..f5639ec1afaad 100644 --- a/paddle/infrt/kernel/test_kernels.h +++ b/paddle/infrt/kernel/test_kernels.h @@ -15,17 +15,21 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the test kernels to registry. */ void RegisterTestKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h index ccd79c048ab14..3b2dcb0018b2f 100644 --- a/paddle/infrt/paddle/cpp/desc_api.h +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -18,7 +18,9 @@ #include #include -namespace infrt::paddle::cpp { +namespace infrt { +namespace paddle { +namespace cpp { /* * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc @@ -226,4 +228,6 @@ class ProgramDescAPI { virtual void SetVersion(int64_t version) = 0; }; -} // namespace infrt::paddle::cpp +} // namespace cpp +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index 285280e69435b..f3de1a630451c 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,7 +22,8 @@ #include "paddle/infrt/common/target.h" #include "paddle/infrt/common/type.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { int SizeOfType(framework_proto::VarType::Type type) { using Type = framework_proto::VarType::Type; @@ -169,4 +170,5 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 73125fadedb82..373f77033dcef 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,7 +25,8 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { namespace framework_proto = ::paddle::framework::proto; // Read a __model__ file. 
@@ -52,4 +53,5 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc index 11186bc68af16..5b28fa5464c54 100644 --- a/paddle/infrt/paddle/pb/block_desc.cc +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/block_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::VarDesc* BlockDesc::GetVar( @@ -40,4 +42,6 @@ framework_proto::OpDesc* BlockDesc::AddOp() { return desc_->add_ops(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h index 9c1b7f9adf172..c9e325699a4bc 100644 --- a/paddle/infrt/paddle/pb/block_desc.h +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -18,7 +18,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -74,4 +76,6 @@ class BlockDesc : public cpp::BlockDescAPI { framework_proto::BlockDesc* desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc index c7b1e66f50642..32dcefb1ac684 100644 --- a/paddle/infrt/paddle/pb/op_desc.cc +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/op_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { google::protobuf::internal::RepeatedPtrIterator FindAttr(framework_proto::OpDesc *desc, const std::string &name) { @@ -136,4 +138,6 @@ GET_ATTRS_IMPL(std::vector, strings); GET_ATTR_IMPL(std::string, s); GET_ATTRS_IMPL(std::vector, longs); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h index 81d57d9f32252..2829f2aca2e08 100644 --- a/paddle/infrt/paddle/pb/op_desc.h +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/framework.pb.h" #include "paddle/infrt/support/variant.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -195,4 +197,6 @@ template <> void OpDesc::SetAttr>(const std::string &name, const std::vector &v); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc index ed8a7e36e0129..9d725485a974d 100644 --- a/paddle/infrt/paddle/pb/program_desc.cc +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -17,7 +17,9 @@ #include #include -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::BlockDesc* ProgramDesc::GetBlock( @@ -32,4 +34,6 @@ ProgramDesc::AddBlock() { return desc_->add_blocks(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h 
index 4adad650c974d..b1e64f8e86611 100644 --- a/paddle/infrt/paddle/pb/program_desc.h +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -21,7 +21,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; class ProgramDesc : public cpp::ProgramDescAPI { @@ -58,4 +60,6 @@ class ProgramDesc : public cpp::ProgramDescAPI { framework_proto::ProgramDesc *desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc index cf80df4f1b845..7ea2e24da3446 100644 --- a/paddle/infrt/paddle/pb/var_desc.cc +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { cpp::VarDescAPI::Type VarDesc::GetType() const { auto type = desc_->type().type(); @@ -364,4 +366,6 @@ VarDesc::mutable_tensor_descs() { return std::vector(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h index 4cff5fdee0375..7215ba6bb6aa7 100644 --- a/paddle/infrt/paddle/pb/var_desc.h +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -23,7 +23,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; // convert between std::vector and protobuf repeated. @@ -121,4 +123,6 @@ class VarDesc : public cpp::VarDescAPI { framework_proto::VarDesc *desc_; }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt From 87ee3e4f5438c567796e128b73eb7703aa56d2ec Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:15:47 +0800 Subject: [PATCH 138/151] [XPU]add stack_grad op for kunlun2,*test=kunlun (#38674) * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun,*test=kunlun * [XPU]add stack_grad op for kunlun2,*test=kunlun Co-authored-by: QingshuChen --- paddle/fluid/operators/stack_op_xpu.cc | 43 ++++++++++++++++--- .../fluid/platform/device/xpu/xpu1_op_list.h | 1 + .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../tests/unittests/xpu/test_stack_op_xpu.py | 19 +++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 01ec4a2b16b4a..a2590e1180c1a 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/stack_op.h" #include -#ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { @@ -59,14 +62,44 @@ class StackXPUKernel : public framework::OpKernel { } }; +template +class StackGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + auto dy_dims = dy->dims(); + + if (axis < 0) axis += dy_dims.size() + 1; + auto dy_shape = framework::vectorize(dy_dims); + + std::vector dx_dims_list(dx.size(), 1); + std::vector dx_lists; + for (auto out : dx) { + dx_lists.push_back(out->mutable_data(ctx.GetPlace())); + } + + int r = xpu::split(dev_ctx.x_context(), dy->data(), dx_lists, + dy_shape, dx_dims_list, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "The stack_grad XPU kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle namespace plat = paddle::platform; namespace ops = paddle::operators; - REGISTER_OP_XPU_KERNEL(stack, - ops::StackXPUKernel, + ops::StackXPUKernel, ops::StackXPUKernel, - ops::StackXPUKernel); + ops::StackXPUKernel); +REGISTER_OP_XPU_KERNEL(stack_grad, + ops::StackGradXPUKernel, + ops::StackGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 26a1426bea036..a76bdd4ae9679 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -300,6 +300,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 79261a5d7bc88..3d140b4693a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -333,6 +333,8 @@ XPUOpMap& get_kl2_ops() { {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py index 68e5a6ccdbfb7..20446aee41ec7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -66,6 +66,15 @@ def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place) + def test_check_grad(self): + if self.dtype == 'int64' or self.dtype == 'int32': + pass + else: + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, self.get_x_names(), 'Y') + class TestStackOp1(TestStackOpBase): def initParameters(self): @@ -81,11 +90,17 @@ class TestStackOp3(TestStackOpBase): def initParameters(self): self.axis = -1 + def test_check_grad(self): + pass + class TestStackOp4(TestStackOpBase): def initParameters(self): self.axis = -4 + def test_check_grad(self): + pass + class TestStackOp5(TestStackOpBase): def initParameters(self): @@ -113,7 +128,7 @@ def initDefaultParameters(self): self.num_inputs = 4 self.input_dim = (5, 6, 7) self.axis = 0 - self.dtype = 'int' + self.dtype = 'int32' def initParameters(self): self.num_inputs = 16 From 050aa6fe5a524b0e7b85201c54a0da315701518d Mon Sep 17 00:00:00 2001 From: heliqi Date: Fri, 14 Jan 2022 16:50:56 +0800 Subject: [PATCH 139/151] add flatten_contiguous_range OpConvert for Paddle-TRT (#38922) * add trt_convert_flatten_contiguous_rang op * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * test cast add trt version >=7 skip --- .../ir_passes/tensorrt_subgraph_pass.cc | 7 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../convert/flatten_contiguous_range_op.cc | 136 ++++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 32 +++++ ...st_trt_convert_flatten_contiguous_range.py | 115 +++++++++++++++ 6 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ef50df3084f8c..55bbc55450876 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -46,8 +46,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( << " is diabled by config in TensorRT"; return false; } - return tensorrt::OpTeller::Global().Tell(node, no_calib_int8, - with_dynamic_shape); + bool is_ok = tensorrt::OpTeller::Global().Tell(node, no_calib_int8, + with_dynamic_shape); + if (!is_ok) + VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT"; + return is_ok; }; framework::ir::SubGraphFuser fuser( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2799fb9e174d3..d4b680288e347 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1416,6 +1416,7 @@ USE_TRT_CONVERTER(elementwise_min_tensor); USE_TRT_CONVERTER(elementwise_pow_tensor); USE_TRT_CONVERTER(transpose); USE_TRT_CONVERTER(flatten); +USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); diff --git 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index a885b69fa7fbc..017caca6adc81 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,7 +3,7 @@ nv_library(tensorrt_converter SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc anchor_generator_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc new file mode 100644 index 0000000000000..706814340a0e9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * flatten_contiguous_range trt converter + */ +class FlattenContiguousRangeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + int dims = input->getDimensions().nbDims; + int start_axis = BOOST_GET_CONST(int, op_desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, op_desc.GetAttr("stop_axis")); + + nvinfer1::IShuffleLayer* layer = nullptr; + if (!engine_->with_dynamic_shape()) { + if (start_axis < 0) start_axis += dims + 1; + if (stop_axis < 0) stop_axis += dims + 1; + int dim_prod = 1; + nvinfer1::Dims flatten_dim; + flatten_dim.nbDims = dims - (stop_axis - start_axis); + for (int i = 0, j = 0; i < dims; ++i) { + if (start_axis <= i + 1 && i + 1 <= stop_axis) { + int dim_i = input->getDimensions().d[i]; + PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); + dim_prod *= dim_i; + if (i + 1 == stop_axis) { + flatten_dim.d[j++] = dim_prod; + } + } else { + flatten_dim.d[j++] = input->getDimensions().d[i]; + } + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(flatten_dim); + } else { + if (start_axis < 0) start_axis += dims; + if (stop_axis < 0) stop_axis += dims; + auto* shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + auto* shape_layer_itensor = shape_layer->getOutput(0); + + nvinfer1::Dims start_dim, size_dim, stride_dim; + start_dim.nbDims = 1; + size_dim.nbDims = 1; + stride_dim.nbDims = 1; + start_dim.d[0] = start_axis; + size_dim.d[0] = stop_axis - start_axis + 1; + stride_dim.d[0] = 1; + auto* slice_layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *shape_layer_itensor, start_dim, + size_dim, stride_dim); + uint32_t reduce_dim = 1; + auto* reduce_prod_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *(slice_layer->getOutput(0)), + nvinfer1::ReduceOperation::kPROD, reduce_dim, true); + + nvinfer1::ITensor* input_shape = nullptr; + if (start_axis == 0 && stop_axis == dims - 1) { + input_shape = reduce_prod_layer->getOutput(0); + } else { + std::vector itensors; + if (start_axis > 0) { + nvinfer1::Dims left_start_dim, left_size_dim, left_stride_dim; + left_start_dim.nbDims = 1; + left_size_dim.nbDims = 1; + left_stride_dim.nbDims = 1; + left_start_dim.d[0] = 0; + left_size_dim.d[0] = start_axis; + left_stride_dim.d[0] = 1; + auto* slice_layer_left = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, left_start_dim, + left_size_dim, left_stride_dim); + itensors.push_back(slice_layer_left->getOutput(0)); + } + itensors.push_back(reduce_prod_layer->getOutput(0)); + if (stop_axis < dims - 1) { + nvinfer1::Dims right_start_dim, right_size_dim, right_stride_dim; + right_start_dim.nbDims = 1; + right_size_dim.nbDims = 1; + right_stride_dim.nbDims = 1; + right_start_dim.d[0] = stop_axis + 1; + right_size_dim.d[0] = dims - stop_axis - 1; + right_stride_dim.d[0] = 1; + auto* slice_layer_right = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, right_start_dim, + right_size_dim, 
right_stride_dim); + itensors.push_back(slice_layer_right->getOutput(0)); + } + auto* concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, itensors.data(), itensors.size()); + concat_layer->setAxis(0); + input_shape = concat_layer->getOutput(0); + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setInput(1, *input_shape); + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "flatten_contiguous_range", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(flatten_contiguous_range, + FlattenContiguousRangeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index ddee4e0d682b0..6663103d4ca37 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -55,6 +55,7 @@ struct SimpleOpTypeSetTeller : public Teller { // #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + teller_set.insert("flatten_contiguous_range"); #endif #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); @@ -531,6 +532,37 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + if (op_type == "flatten_contiguous_range") { + if (!with_dynamic_shape) { + int start_axis = BOOST_GET_CONST(int, desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, desc.GetAttr("stop_axis")); + auto x_var_name = desc.Input("X")[0]; + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + int dims = x_shape.size(); + if (start_axis < 0) start_axis += dims; + if (start_axis == 0) { + VLOG(3) << "TRT flatten_contiguous_range not support the " + "batch-dimension being changed"; + return false; + } + if (stop_axis < 0) stop_axis += dims; + for (int i = start_axis; i <= stop_axis; ++i) { + if (x_shape[i] < 0) { + VLOG(3) << "On TRT static shape,flatten_contiguous_range input dim " + "should be > 0"; + return false; + } + } + } + } if (op_type == "gather") { auto gather_inputs = desc.Inputs(); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py new file mode 100644 index 0000000000000..a4060349d4bed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
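A minimal Python sketch of the shape arithmetic that the static-shape branch of the converter above performs: the dimensions in the inclusive range [start_axis, stop_axis] collapse into their product, while the remaining dimensions pass through unchanged. The helper name and the sample shapes are illustrative assumptions, not code from this patch.

# flattened_shape is a hypothetical helper used only to illustrate the
# expected framework-level semantics of flatten_contiguous_range.
def flattened_shape(shape, start_axis, stop_axis):
    ndim = len(shape)
    # Normalize negative axes, matching the converter's handling.
    if start_axis < 0:
        start_axis += ndim
    if stop_axis < 0:
        stop_axis += ndim
    prod = 1
    for d in shape[start_axis:stop_axis + 1]:
        prod *= d
    return shape[:start_axis] + [prod] + shape[stop_axis + 1:]

# Sample shape modeled on the test below ([2, batch, 4, 8, 3] with batch=2).
assert flattened_shape([2, 2, 4, 8, 3], 1, 3) == [2, 64, 3]
assert flattened_shape([2, 2, 4, 8, 3], 0, 4) == [384]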
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertFlattenContiguousRangeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch): + return np.random.random([2, batch, 4, 8, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for start_axis in range(5): + for stop_axis in range(start_axis, 5): + type = "flatten_contiguous_range" + op_outputs = { + "Out": ["output_data"], + "XShape": ["xshape_data"] + } + ops_config = [{ + "op_type": type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": op_outputs, + "op_attrs": { + "start_axis": start_axis, + "stop_axis": stop_axis, + } + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch)) + }, + outputs=["output_data"]) + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [2, 1, 4, 8, 3]} + self.dynamic_shape.max_input_shape = {"input_data": [2, 4, 4, 8, 3]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 2, 4, 8, 3]} + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: + if dynamic_shape: + return 1, 2 + else: + if attrs[0]['start_axis'] == 0: + return 0, 3 + else: + return 1, 2 + else: + return 0, 3 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From a88791481484ab6a61540a737336d79c65d021dc Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 15 Jan 2022 12:39:49 +0800 Subject: [PATCH 140/151] fix performance problem caused by Conj (#38939) --- paddle/pten/kernels/complex_kernel.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index b6074f117ea14..d12fc730fef87 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/platform/complex.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" @@ -23,7 +24,13 @@ namespace pten { template void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); -template +// If T is complex +template >::value || + std::is_same>::value, + bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { auto out_meta = UnchangedInferMeta(x.meta()); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); @@ -31,4 +38,15 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return dense_out; } +// If T is not complex +template >::value && + !std::is_same>::value, + bool> = true> +DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { + return x; +} + } // namespace pten From 88966b283952096f81aab4918b7d83b303aabad2 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 15 Jan 2022 14:35:33 +0800 Subject: [PATCH 141/151] [Unify Tensors PR #7] Merged LoDTensor with Tensor, test=allcases (#38880) * Merged LoDTensor with Tensor,test=allcases * Patched python level LoDTensor * Fixed example code failure * Polished function names, removed duplicated forward declarations --- paddle/fluid/distributed/fleet.h | 2 +- .../fluid/distributed/service/brpc_utils.cc | 2 +- .../test/brpc_service_dense_sgd_test.cc | 2 +- .../test/brpc_service_sparse_sgd_test.cc | 2 +- paddle/fluid/framework/data_feed.h | 2 +- .../framework/details/fetch_async_op_handle.h | 2 +- .../framework/details/variable_visitor.cc | 2 +- paddle/fluid/framework/device_worker.cc | 2 +- paddle/fluid/framework/device_worker.h | 3 +- paddle/fluid/framework/downpour_worker.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 2 +- paddle/fluid/framework/feed_fetch_method.h | 2 +- .../ir/conv_affine_channel_fuse_pass.cc | 2 +- .../fluid/framework/ir/conv_bn_fuse_pass.cc | 2 +- .../framework/ir/delete_dropout_op_pass.cc | 2 +- .../ir/delete_quant_dequant_op_pass.cc | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- paddle/fluid/framework/lod_tensor.cc | 28 -- paddle/fluid/framework/lod_tensor.h | 24 +- paddle/fluid/framework/naive_executor.h | 2 +- paddle/fluid/framework/operator.cc | 12 +- paddle/fluid/framework/operator.h | 6 - paddle/fluid/framework/pull_dense_worker.cc | 2 +- paddle/fluid/framework/tensor.h | 9 +- paddle/fluid/framework/tensor_util.cc | 24 ++ paddle/fluid/framework/tensor_util.h | 4 +- paddle/fluid/framework/trainer.h | 2 +- paddle/fluid/framework/var_type_traits.h | 7 +- paddle/fluid/inference/api/api_impl.h | 2 +- .../api/details/reset_tensor_array.h | 2 +- paddle/fluid/operators/assert_op.cc | 2 +- paddle/fluid/operators/assign_op.h | 2 +- .../operators/controlflow/while_op_helper.h | 2 +- paddle/fluid/operators/math/beam_search.cc | 1 - .../fluid/operators/math/beam_search_npu.cc | 1 - .../fluid/operators/math/sequence_padding.cc | 1 - paddle/fluid/operators/math/sequence_scale.cc | 2 +- paddle/fluid/operators/math/sequence_scale.h | 2 +- paddle/fluid/operators/memcpy_d2h_op.h | 2 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/memcpy_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 1 - paddle/fluid/operators/print_op.cc | 2 +- paddle/fluid/operators/recurrent_op.cc | 2 +- .../reorder_lod_tensor_by_rank_op.cc | 2 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/tensor_formatter.h | 2 +- 
paddle/fluid/operators/transfer_layout_op.h | 2 +- paddle/fluid/platform/lodtensor_printer.cc | 2 +- paddle/fluid/pybind/pybind.cc | 277 ++++++------------ paddle/pten/api/lib/utils/tensor_utils.cc | 51 ++-- paddle/pten/api/lib/utils/tensor_utils.h | 18 +- python/paddle/fluid/__init__.py | 5 + 54 files changed, 203 insertions(+), 343 deletions(-) diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 6d9ce01535e9d..697dbb9170f18 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -36,7 +36,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; class SelectedRows; class Variable; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 6eb8462977b60..db55c9ad438a7 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace framework { class Variable; -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc index 68d1d457500c7..c0c1fda4c4fca 100644 --- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc @@ -31,7 +31,7 @@ class PSClient; class PSServer; } // namespace distributed namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index 8fb3434af6e28..471750feaefef 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -32,7 +32,7 @@ class PSClient; class PSServer; } // namespace distributed namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index a4100e66e7285..2533acaa6d35a 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -50,7 +50,7 @@ DECLARE_bool(enable_slotrecord_reset_shrink); namespace paddle { namespace framework { class DataFeedDesc; -class LoDTensor; +class Tensor; class Scope; class Variable; } // namespace framework diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h index f863cc304b8a5..41df0d90aaf81 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.h +++ b/paddle/fluid/framework/details/fetch_async_op_handle.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; namespace ir { class Node; diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 71e5dd28eded1..56c88e9d25a91 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 
fbaae5a21c274..3b70ef737f5be 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 15acedf3cf50a..332a584049127 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -43,10 +43,9 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class ProgramDesc; class Scope; -class Tensor; } // namespace framework namespace platform { class DeviceContext; diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 11f70acb73aa7..cc97af4b1969d 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 2eac65c90c02f..0c3aafd85f283 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; void SetFeedVariable(Scope* scope, const LoDTensor& input, diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 4c2f5b9796a22..dc9310ff5b263 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; void SetFeedVariable(Scope* scope, const LoDTensor& input, diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 6cd16132c2a10..c883412a9a4c3 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index b6c410dc957fd..6443d0594a9c5 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -21,7 +21,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc index 09962239a01b1..c0a4f099e39d4 100644 --- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -17,7 +17,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index b99f2266f39b2..af75646551e28 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 0d490d4e669fc..09fd6b8dd1116 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 69a2a6eefaf8c..4681933a66cd3 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -27,34 +27,6 @@ class DeviceContext; namespace paddle { namespace framework { -std::ostream &operator<<(std::ostream &os, const LoD &lod) { - os << "{"; - for (auto &v : lod) { - os << "{"; - bool is_first = true; - for (auto &i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - - return os; -} - -std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - if (t.lod().size() > 0) { - os << " - lod: " << t.lod() << "\n"; - } - os << static_cast(t); - return os; -} - std::string LoDToString(const LoD &lod) { std::ostringstream stream; stream << lod; diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 22f2027998137..bbb8f8005168c 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -28,9 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" namespace paddle { -namespace framework { -class LoDTensor; -} // namespace framework namespace platform { class DeviceContext; } // namespace platform @@ -39,6 +36,8 @@ class DeviceContext; namespace paddle { namespace framework { +using LoDTensor = paddle::framework::Tensor; + /* * LoD is short for Level of Details. * @@ -56,9 +55,6 @@ namespace framework { */ using LoD = std::vector>; -std::ostream& operator<<(std::ostream& os, const LoD& lod); -std::ostream& operator<<(std::ostream& os, const LoDTensor& t); - std::string LoDToString(const LoD& lod); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, @@ -102,22 +98,6 @@ bool CheckLoD(const LoD& in, int tensor_height = -1); */ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); -/* - * LoDTensor (Level of details Tensor) - * see https://en.wikipedia.org/wiki/Level_of_details for reference. - */ -class LoDTensor : public Tensor { - public: - using Tensor::Tensor; - - // Split LoDTensor and copy to each place specified in places. - std::vector SplitLoDTensor( - const std::vector places) const; - - void MergeLoDTensor(const std::vector& lod_tensors, - platform::Place place); -}; - /* * Expand the `source` to fit the LoD of `lod`. For example, a `source` * LoDTensor is diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index ed475e66f626d..f706eabb47988 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -31,7 +31,7 @@ namespace framework { * Simple, intuitive and effective. Only single thread is supported, and * currently designed for inference. */ -class LoDTensor; +class Tensor; class ProgramDesc; class Scope; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 93349b8b88449..aa21c8eed256b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -34,7 +34,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle #ifdef PADDLE_WITH_XPU @@ -555,11 +555,6 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { return it->second.empty() ? 
nullptr : it->second[0]; } -template <> -const Tensor* ExecutionContext::Input(const std::string& name) const { - return Input(name); -} - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -584,11 +579,6 @@ const std::vector ExecutionContext::MultiInput( return res; } -template <> -Tensor* ExecutionContext::Output(const std::string& name) const { - return Output(name); -} - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 9d75c66beb7d4..12946b416cf9f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -479,16 +479,10 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { const ExecutionContext& ctx_; }; -template <> -const Tensor* ExecutionContext::Input(const std::string& name) const; - template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; -template <> -Tensor* ExecutionContext::Output(const std::string& name) const; - template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 77d8abcd26e9e..b13aaadc81661 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; class Variable; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index fcdb837bc80ce..95405820a48d9 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -36,7 +36,7 @@ namespace paddle { namespace framework { -class LoDTensor; +using LoD = std::vector>; /* NOTE(liym27): [ What is TensorInplaceVersion used for? ] @@ -74,6 +74,13 @@ class Tensor : public pten::DenseTensor { using DenseTensor = pten::DenseTensor; using DenseTensor::DenseTensor; + // Split Tensor and copy to each place specified in places. + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + /*! The internal of two tensors share the same memory block. */ Tensor& ShareDataWith(const Tensor& src); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 724e3cc1e2ee8..84334417dc7da 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1428,7 +1428,31 @@ std::ostream& print_tensor>( return os; } +std::ostream& operator<<(std::ostream& os, const LoD& lod) { + os << "{"; + for (auto& v : lod) { + os << "{"; + bool is_first = true; + for (auto& i : v) { + if (is_first) { + os << i; + is_first = false; + } else { + os << ", " << i; + } + } + os << "}"; + } + os << "}"; + + return os; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t) { + if (t.lod().size() > 0) { + os << " - lod: " << t.lod() << "\n"; + } + os << " - place: " << t.place() << "\n"; os << " - shape: [" << t.dims() << "]\n"; os << " - layout: " << DataLayoutToString(t.layout()) << "\n"; diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 11858e4166595..355be39baa2a5 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -39,6 +39,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const Tensor& t); + class PrintOptions { public: static PrintOptions& Instance() { @@ -494,6 +497,5 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { delete[] array; } -std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 4823c08305760..8bba9492a5686 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -40,7 +40,7 @@ namespace paddle { namespace framework { class Dataset; -class LoDTensor; +class Tensor; class ProgramDesc; class PullDenseWorker; class Scope; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index f4c41197a9dfa..715e7a14c5529 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -70,11 +70,10 @@ class BKCLCommunicator; namespace framework { class LoDRankTable; class ScopeBase; -class LoDTensor; +class Tensor; class ReaderHolder; class Scope; class SelectedRows; -class Tensor; } // namespace framework namespace operators { @@ -164,8 +163,8 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. using VarTypeRegistry = detail::VarTypeRegistryImpl< - Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + Tensor, SelectedRows, std::vector, LoDRankTable, Strings, + LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index be771ac48fc15..bf67cfed35f89 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -35,7 +35,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index f12a54cdccedc..857160ad10282 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -23,7 +23,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index 3e4250389fcfc..466e0e793e4e3 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; class Variable; diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index bd314a00424bd..d9648c9617255 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -27,7 +27,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 1685da4e95822..8ef12ca05e36a 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class ProgramDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 5271da91b8c15..c52ba68331580 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc index 6afaaea0673b2..5aede02263dd5 100644 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ b/paddle/fluid/operators/math/beam_search_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index e29313e9f742c..491d40d3ae567 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index 8e58411a1f247..f4193bb71fabb 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h index d84513e024d7f..c6c84bb55dfa7 100644 --- a/paddle/fluid/operators/math/sequence_scale.h +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index efa8af8054fc8..94eed5cf83fee 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -24,7 +24,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index a19dc3367a14b..cc6e771d105ae 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -25,7 +25,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index 57dafab1d5bc7..b270d87ad00ea 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -27,7 +27,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 5024148fe5888..dae598ef64220 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 30e788bb395a4..754b46c823b28 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index c558f1852f54c..cef2993fc30d5 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 556f1bccd1680..7adf7962e1987 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index d8d4e641aeb3e..4ba071032162a 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { class LoDRankTable; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index fe646b2830b66..0ff622d329919 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h index 4608663b3ed9b..38e3e7a94a524 100644 --- a/paddle/fluid/operators/tensor_formatter.h +++ b/paddle/fluid/operators/tensor_formatter.h @@ -20,7 +20,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 1f09aec05b936..28135e37ed7bb 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -29,7 +29,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index d607dbe5b9999..4a5dfbee15de2 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b5845a1ef9628..5f4e9a8861390 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -875,12 +875,12 @@ PYBIND11_MODULE(core_noavx, m) { .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( - Set the data of LoDTensor on place with given numpy array. + Set the data of Tensor on place with given numpy array. Args: lod (numpy.ndarray): The data to set. place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the - LoDTensor is to be set. + Tensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -893,17 +893,17 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) )DOC") .def("shape", [](framework::Tensor &self) { return vectorize(self.dims()); }, R"DOC( - Return the shape of LoDTensor. + Return the shape of Tensor. Returns: - list[int]: The shape of LoDTensor. 
+ list[int]: The shape of Tensor. Examples: @@ -912,7 +912,7 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) print(t.shape()) # [5, 30] )DOC") @@ -949,117 +949,34 @@ PYBIND11_MODULE(core_noavx, m) { }) .def("_share_data_with", &framework::Tensor::ShareDataWith) .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", [](const framework::Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }); - - // TODO(cql): add reference: en_user_guide_lod_tensor - py::class_(m, "LoDTensor", R"DOC( - LoDTensor is a Tensor with optional LoD (Level of Details) information, - it can be used for variable-length sequences, - see :ref:`user_guide_lod_tensor` for details. - - LoDTensor can be converted to numpy array using :code:`numpy.array(lod_tensor)`. - - You can skip the following explanation if you don't need to know details - of LoDTensor. - - The following two examples show how to use LODtensor to represent - variable-length sequences. - - Example 1: - - Suppose x is a LoDTensor representing a variable-length sequence. - It contains two logical subsequences, the length of first logical sequence - is 2 (e.g., number of samples is 2), the length of second logical sequence - is 3, and the total length is 5. The data of the first logical sequence is - [1, 2], [3, 4], and the data of the second logical sequence is [5, 6], - [7, 8], [9, 10]. The data dimension of each sample is 2. So, the final - shape of the LoDTensor is [5, 2], of which 5 is the total length and 2 is - the dimension of each sample. - - Logically, we can represent the variable-length sequence in two ways: one - is in the form of recursive sequence lengths, that is, - x.recursive_sequence_lengths=[[2, 3]]; the other is in the form of offsets, - that is, x.lod=[[0, 2, 2+3]]. These two representations are equivalent, and - you can set and retrieve recursive_sequence_lengths or LoD through the - corresponding interfaces of LoDTensor introduced later. - - Actually, in order to access sequence faster, Paddle uses offset to store - different lengths of sequences. - Therefore, the operations on recursive_sequence_lengths will be converted - to the operations on LoD eventually. - - .. code-block:: python - - y.data = [[1, 2], [3, 4], - [5, 6], [7, 8], - [9, 10], [11, 12], [13, 14]] - - y.shape = [2+2+3, 2] - - y.recursive_sequence_lengths = [[2, 1], [2, 2, 3]] - - y.lod = [[0, 2, 3], [0, 2, 4, 7]] - - Example 2: - - LoD may have more than one level (for example, a paragraph may have more - than one sentence and a sentence may have more than one word). Suppose y - is a LoDTensor and its lod_level is 2. - From level = 0, there are two logical sequences, the length of which is - 2 and 1, respectively, indicating that the first logical sequence contains - two sub-sequences and the second logical sequence contains one sub-sequence. - From level = 1, the lengths of two sub-sequences contained by the first - logical sequence is 2 and 2, and the length of sub-sequence contained by - the second logical sequence is 3. - - Therefore, the LoDTensor is represented in the form of recursive sequence - lengths as y.recursive_sequence_lengths=[[2,1], [2,2,3]]; and equally, in - the form of offset, it is represented as y.lod=[[0,2,3], [0,2,4,7]]. - - .. 
code-block:: python - - y.data = [[1, 2], [3, 4], - [5, 6], [7, 8], - [9, 10], [11, 12], [13, 14]] - - y.shape = [2+2+3, 2] - - y.recursive_sequence_lengths = [[2, 1], [2, 2, 3]] - - y.lod = [[0, 2, 3], [0, 2, 4, 7]] - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - t = fluid.LoDTensor() - - )DOC") - .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) + .def("__str__", + [](const framework::Tensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) /* ------ End of original Tensor ------ */ + .def( + "__init__", + [](framework::Tensor &instance, const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) .def("__init__", - [](LoDTensor &instance, const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) LoDTensor(new_offset_lod); + [](framework::Tensor &instance) { + new (&instance) framework::Tensor(); }) - .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) // We implement offset based LOD in C++ while we use length based with // Python API. So we changed set_lod to set_recursive_sequence_lengths // to @@ -1067,7 +984,8 @@ PYBIND11_MODULE(core_noavx, m) { // The discussion is here: // https://github.com/PaddlePaddle/Paddle/issues/10855 .def("set_lod", - [](LoDTensor &self, const std::vector> &lod) { + [](framework::Tensor &self, + const std::vector> &lod) { // the input lod is offset-based level-of-detail info LoD new_lod; new_lod.reserve(lod.size()); @@ -1079,7 +997,7 @@ PYBIND11_MODULE(core_noavx, m) { self.set_lod(new_lod); }, py::arg("lod"), R"DOC( - Set LoD of the LoDTensor. + Set LoD of the Tensor. Args: lod (list[list[int]]): The lod to set. @@ -1093,14 +1011,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") .def("set_recursive_sequence_lengths", - [](LoDTensor &self, const std::vector> - &recursive_sequence_lengths) { + [](framework::Tensor &self, const std::vector> + &recursive_sequence_lengths) { // the input recursive_sequence_lengths is length-based // level-of-detail info LoD new_lod; @@ -1119,7 +1037,7 @@ PYBIND11_MODULE(core_noavx, m) { self.set_lod(new_offset_lod); }, py::arg("recursive_sequence_lengths"), R"DOC( - Set LoD of the LoDTensor according to recursive sequence lengths. + Set LoD of the Tensor according to recursive sequence lengths. 
For example, if recursive_sequence_lengths=[[2, 3]], which means there are two sequences with length 2 and 3 respectively, the @@ -1137,14 +1055,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_length()) # [[2, 3]] + print(t.recursive_sequence_lengths()) # [[2, 3]] print(t.lod()) # [[0, 2, 5]] )DOC") .def("lod", - [](LoDTensor &self) -> std::vector> { + [](framework::Tensor &self) -> std::vector> { // output the offset-based lod info LoD lod = self.lod(); std::vector> new_lod; @@ -1153,10 +1071,10 @@ PYBIND11_MODULE(core_noavx, m) { return new_lod; }, R"DOC( - Return the LoD of the LoDTensor. + Return the LoD of the Tensor. Returns: - list[list[int]]: The lod of the LoDTensor. + list[list[int]]: The lod of the Tensor. Examples: .. code-block:: python @@ -1164,14 +1082,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") // Set above comments of set_lod. .def("recursive_sequence_lengths", - [](LoDTensor &self) -> std::vector> { + [](framework::Tensor &self) -> std::vector> { // output the length-based lod info LoD lod = ConvertToLengthBasedLoD(self.lod()); std::vector> new_lod; @@ -1181,7 +1099,7 @@ PYBIND11_MODULE(core_noavx, m) { }, R"DOC( Return the recursive sequence lengths corresponding to of the LodD - of the LoDTensor. + of the Tensor. Returns: list[list[int]]: The recursive sequence lengths. @@ -1192,19 +1110,19 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) print(t.recursive_sequence_lengths()) # [[2, 3]] )DOC") .def("has_valid_recursive_sequence_lengths", - [](LoDTensor &self) -> bool { + [](framework::Tensor &self) -> bool { // Check that the lod info is valid and match the outermost - // dimension of the LoDTensor data + // dimension of the Tensor data return CheckLoD(self.lod(), vectorize(self.dims()).front()); }, R"DOC( - Check whether the LoD of the LoDTensor is valid. + Check whether the LoD of the Tensor is valid. Returns: bool: Whether the LoD is valid. @@ -1215,91 +1133,80 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) print(t.has_valid_recursive_sequence_lengths()) # True )DOC") - .def("__getitem__", PySliceTensor, py::return_value_policy::reference, - R"DOC( - Slice the original Tensor, and remove the LoD information. - - Returns: - out (Tensor): new Tensor(NOT LoDTensor). 
- )DOC") - .def("__str__", - [](const LoDTensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) .def("_as_type", - [](const LoDTensor &self, + [](const framework::Tensor &self, paddle::framework::proto::VarType::Type type) { - LoDTensor dst; + framework::Tensor dst; if (self.IsInitialized() && self.numel() > 0) { TransDataType(self, type, &dst); } return dst; }) - .def("_copy", [](const LoDTensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - LoDTensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; + .def("_copy", + [](const framework::Tensor &self, const platform::Place &place) { + // follow fetch_op's inplementation + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Not copy, if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; #ifdef _WIN32 - }); + }); #else }) .def(py::pickle( - [](const LoDTensor &t) { // __getstate__ + [](const framework::Tensor &t) { // __getstate__ auto holder = t.Holder(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(holder->place()), true, - platform::errors::PreconditionNotMet( - "LoDTensor is not on CPU." - "Now only LoDTensor on CPU can be serialized.")); - auto* mmap_writer_allocation = - dynamic_cast( - holder.get()); - PADDLE_ENFORCE_NOT_NULL(mmap_writer_allocation, - platform::errors::PreconditionNotMet( - "LoDTensor is not in shared memory." - "Now only LoDTensor on shared memory can be serialized.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, + platform::errors::PreconditionNotMet( + "Tensor is not on CPU." + "Now only Tensor on CPU can be serialized.")); + auto *mmap_writer_allocation = + dynamic_cast( + holder.get()); + PADDLE_ENFORCE_NOT_NULL( + mmap_writer_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not in shared memory." + "Now only Tensor on shared memory can be serialized.")); int type_idx = static_cast(t.type()); return py::make_tuple(mmap_writer_allocation->ipc_name(), - mmap_writer_allocation->size(), - type_idx, vectorize(t.dims()), t.lod()); + mmap_writer_allocation->size(), type_idx, + vectorize(t.dims()), t.lod()); }, [](py::tuple t) { // __setstate__ if (t.size() != 5) - throw std::runtime_error("Invalid LoDTensor state!"); + throw std::runtime_error("Invalid Tensor state!"); // 1. Create a new C++ instance - LoDTensor tensor; + framework::Tensor tensor; // 2. Rebuild Allocation const std::string &ipc_name = t[0].cast(); size_t size = t[1].cast(); auto shared_reader_holder = - memory::allocation::RebuildMemoryMapReaderAllocation( - ipc_name, size); + memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, + size); // 3. Maintain global fd set - VLOG(3) << "LoDTensor ipc name: " << ipc_name; + VLOG(3) << "Tensor ipc name: " << ipc_name; memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 4. Rebuild LoDTensor - tensor.ResetHolderWithType(shared_reader_holder, - static_cast(t[2].cast())); + // 4. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[2].cast())); tensor.Resize(make_ddim(t[3].cast>())); tensor.set_lod(t[4].cast()); diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 53d641896e43f..edd5cde938630 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -31,7 +31,7 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePtenDenseTensorBase( const paddle::framework::Tensor& src) { VLOG(3) << "MakePtenDenseTensor based Tensor."; pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), @@ -44,15 +44,15 @@ std::unique_ptr MakePtenDenseTensor( } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src) { - auto out = - MakePtenDenseTensor(static_cast(src)); + const paddle::framework::Tensor& src) { + auto out = MakePtenDenseTensorBase( + static_cast(src)); SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), src.lod()); return std::move(out); } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePtenDenseTensorBase( const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { pten::DenseTensorMeta meta{ arg_def.dtype, src.dims(), src.layout(), src.offset()}; @@ -71,16 +71,15 @@ std::unique_ptr MakePtenDenseTensor( } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def) { - auto out = MakePtenDenseTensor( + const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { + auto out = MakePtenDenseTensorBase( static_cast(src), arg_def); SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), src.lod()); return std::move(out); } -pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src) { +pten::Scalar MakePtenScalar(const paddle::framework::Tensor& src) { PADDLE_ENFORCE_EQ(src.numel(), 1, paddle::platform::errors::InvalidArgument( @@ -138,7 +137,7 @@ pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { } } -pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src) { +pten::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src) { if (src.type() == paddle::framework::proto::VarType::INT64) { return {src.data(), src.numel()}; } else if (src.type() == paddle::framework::proto::VarType::INT32) { @@ -295,7 +294,7 @@ std::unique_ptr MakePtenTensorBaseFromVar( return {}; } -void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { +void MovesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { PADDLE_ENFORCE_NOT_NULL( src, platform::errors::InvalidArgument( @@ -311,12 +310,12 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->set_offset(src->meta().offset); } -void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { - MovesStorage(src, static_cast(dst)); +void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + MovesStorageBase(src, static_cast(dst)); SetLoD(dst->mutable_lod(), src->lod()); } -void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { +void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { PADDLE_ENFORCE_NOT_NULL( src, platform::errors::InvalidArgument( @@ -333,13 +332,13 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->set_offset(src->meta().offset); } -void 
SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { - SharesStorage(src, static_cast(dst)); +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + SharesStorageBase(src, static_cast(dst)); SetLoD(dst->mutable_lod(), src->lod()); } -void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, - pten::DenseTensor* dst) { +void ReMakePtenDenseTensorBase(const paddle::framework::Tensor& src, + pten::DenseTensor* dst) { VLOG(3) << "ReMakePtenDenseTensor based Tensor."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); @@ -361,17 +360,17 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, shared_storage->ResetAllocation(src.Holder()); } -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, pten::DenseTensor* dst) { auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); SetLoD(&meta->lod, src.lod()); - ReMakePtenDenseTensor(static_cast(src), - dst); + ReMakePtenDenseTensorBase(static_cast(src), + dst); } -void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, - const pten::TensorArgDef& arg_def, - pten::DenseTensor* dst) { +void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); @@ -395,12 +394,12 @@ void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, } } -void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst) { auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); SetLoD(&meta->lod, src.lod()); - ReMakePtenDenseTensorByArgDef( + ReMakePtenDenseTensorByArgDefBase( static_cast(src), arg_def, dst); } diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 06edb4a7516b0..0ac4ac7a33179 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -33,12 +33,9 @@ namespace experimental { std::unique_ptr MakePtenDenseTensor( const paddle::framework::Tensor& src); -std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src); - -pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src); +pten::Scalar MakePtenScalar(const paddle::framework::Tensor& src); -pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src); +pten::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src); pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable); @@ -56,12 +53,8 @@ std::unique_ptr MakePtenTensorBaseFromVar( void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); -void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); - void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); -void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); - /** * In order to improve the compatibility state performance, some tricky tool * functions are added. 
@@ -74,17 +67,10 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, pten::DenseTensor* dst); -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, - pten::DenseTensor* dst); - void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); -void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def, - pten::DenseTensor* dst); - void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index ec589b40e907f..0339abe0960c2 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -32,6 +32,10 @@ except Exception as e: raise e +# Patch LoDTensor +from . import core +core.LoDTensor = core.Tensor + # import all class inside framework into fluid module from . import framework from .framework import * @@ -69,6 +73,7 @@ from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder + from .core import LoDTensor, LoDTensorArray, Scope, _Scope from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace from .incubate import fleet From 1053b1d5ed04f411db50e66848210d9f1996bde4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Jan 2022 14:52:58 +0800 Subject: [PATCH 142/151] replace last contextT (#38971) --- paddle/pten/kernels/gpu/scale_kernel.cu | 4 ++-- paddle/pten/kernels/math_kernel.h | 24 ++++++++++++------------ paddle/pten/kernels/scale_kernel.h | 8 ++++---- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 4d63701413cd6..14ee75e4f9130 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -43,8 +43,8 @@ struct ScaleFunctor { } }; -template -void ScaleKernel(const ContextT& dev_ctx, +template +void ScaleKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index e01103fc5b847..65c0f84e696de 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -67,8 +67,8 @@ void SumKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out); -template -DenseTensor Add(const ContextT& dev_ctx, +template +DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -77,12 +77,12 @@ DenseTensor Add(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - AddKernel(dev_ctx, x, y, axis, &dense_out); + AddKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Subtract(const ContextT& dev_ctx, +template +DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -91,12 +91,12 @@ DenseTensor Subtract(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - SubtractKernel(dev_ctx, x, y, axis, &dense_out); + SubtractKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Divide(const ContextT& dev_ctx, +template +DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, 
int axis) { @@ -105,12 +105,12 @@ DenseTensor Divide(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - DivideKernel(dev_ctx, x, y, axis, &dense_out); + DivideKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Multiply(const ContextT& dev_ctx, +template +DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -119,7 +119,7 @@ DenseTensor Multiply(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - MultiplyKernel(dev_ctx, x, y, axis, &dense_out); + MultiplyKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index ba16db566b8bb..1cd11f0b8788f 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -28,15 +28,15 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out); -template -DenseTensor Scale(const ContextT& dev_ctx, +template +DenseTensor Scale(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, bool bias_after_scale) { auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - ScaleKernel( + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ScaleKernel( dev_ctx, x, scale, bias, bias_after_scale, &dense_out); return dense_out; } From 35d2b71ab531b7b34c42576da49651ba7282300f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Jan 2022 15:44:48 +0800 Subject: [PATCH 143/151] [PTen] Remove cached kernel context (#38953) * remove cached kernel context * revert dataloader format change --- .../framework/new_executor/interpretercore.cc | 9 +- .../new_executor/interpretercore_util.cc | 11 +- .../new_executor/new_executor_defs.cc | 4 - .../new_executor/new_executor_defs.h | 5 +- paddle/fluid/framework/operator.cc | 122 +++++------------- paddle/fluid/framework/operator.h | 13 +- paddle/fluid/imperative/layer.cc | 15 +-- paddle/fluid/imperative/op_base.h | 5 - paddle/fluid/imperative/prepared_operator.cc | 100 ++++---------- paddle/fluid/imperative/prepared_operator.h | 13 +- paddle/fluid/imperative/tracer.cc | 2 - .../fluid/dataloader/dataloader_iter.py | 19 --- 12 files changed, 82 insertions(+), 236 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 950756c0394a5..aea9ad2035396 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -418,15 +418,16 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "Run pten kernel: " << op->Type(); VLOG(4) << instr_node.InnerRuntimeContext().get() << " " << &instr_node.DeviceContext(); + pten::KernelContext pt_kernel_context; op_with_kernel->BuildPtenKernelContext( *instr_node.InnerRuntimeContext().get(), - const_cast(&instr_node.DeviceContext())); + const_cast(&instr_node.DeviceContext()), + &pt_kernel_context); - (*instr_node.PtenKernel())(instr_node.PtenKernelContext()); + (*instr_node.PtenKernel())(&pt_kernel_context); op_with_kernel->WriteBackToOutputs( - instr_node.InnerRuntimeContext().get()); - instr_node.PtenKernelContext()->ClearData(); + instr_node.InnerRuntimeContext().get(), &pt_kernel_context); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); } diff --git 
a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 7ced4853c2d8f..214a1d728266b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -425,13 +425,14 @@ void build_op_func_list(const platform::Place& place, } if (run_pten_kernel) { - op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx); + pten::KernelContext pt_kernel_context; + op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx, + &pt_kernel_context); op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); - op_func_node.pt_kernel_context_ = op_with_kernel->PtenKernelContext(); - (*op_func_node.pt_kernel_)(op_func_node.pt_kernel_context_); - op_with_kernel->WriteBackToOutputs(&runtime_context); - op_func_node.pt_kernel_context_->ClearData(); + (*op_func_node.pt_kernel_)(&pt_kernel_context); + op_with_kernel->WriteBackToOutputs(&runtime_context, + &pt_kernel_context); } else { op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); op_func_node.kernel_func_(exec_ctx); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 654746794da4e..fb29e18887b4e 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -688,10 +688,6 @@ pten::Kernel* Instruction::PtenKernel() const { return op_func_node_.pt_kernel_; } -pten::KernelContext* Instruction::PtenKernelContext() const { - return op_func_node_.pt_kernel_context_; -} - OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } OperatorBase* Instruction::OpBase() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 5d63eb33d424b..0ef85a25a237b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -299,8 +299,7 @@ struct OpFuncNode { platform::DeviceContext* dev_ctx_; // not owned // fit for pten kernel - pten::Kernel* pt_kernel_{nullptr}; // not owned - pten::KernelContext* pt_kernel_context_{nullptr}; // not onwed + pten::Kernel* pt_kernel_{nullptr}; // not owned OpFuncType type_; }; @@ -322,8 +321,6 @@ class Instruction { pten::Kernel* PtenKernel() const; - pten::KernelContext* PtenKernelContext() const; - OpFuncType KernelType() const; OperatorBase* OpBase() const; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index aa21c8eed256b..ff12edb72c06a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1192,13 +1192,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::EventRole::kInnerOp); if (run_pten_kernel_) { - if (pt_kernel_context_ == nullptr) { - pt_kernel_context_.reset(new pten::KernelContext()); - } - BuildPtenKernelContext(*runtime_ctx, dev_ctx); - (*pt_kernel_)(pt_kernel_context_.get()); - WriteBackToOutputs(runtime_ctx); - pt_kernel_context_->ClearData(); + pten::KernelContext pt_kernel_context; + BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + (*pt_kernel_)(&pt_kernel_context); + WriteBackToOutputs(runtime_ctx, &pt_kernel_context); } else { (*kernel_func_)( ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); @@ -1791,18 +1788,9 @@ KernelSignature 
OperatorWithKernel::GetExpectedPtenKernelArgs( } void OperatorWithKernel::BuildPtenKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const { - if (pt_kernel_context_ == nullptr) { - pt_kernel_context_.reset(new pten::KernelContext()); - } - // TODO(chenweihang): now only work for very simple case, - // many cases need to be deal with later: - // 1. the input and output are not tensor - // 2. the dispensbale, duplicable input and output - // 3. needless attributes remove - // 4. use pt Tensor directly - // 5. kernel input is not DenseTensor - pt_kernel_context_->SetDeviceContext(dev_ctx); + const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + pten::KernelContext* pt_kernel_context) const { + pt_kernel_context->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature_->args); auto& attr_names = std::get<1>(pt_kernel_signature_->args); @@ -1836,33 +1824,14 @@ void OperatorWithKernel::BuildPtenKernelContext( // calcute the start and end index of the input tensors size_t start_idx = - (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); + (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - auto current_vector_size = pt_kernel_context_->InputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto& input_ptr = - pt_kernel_context_->MutableInputPtrAt(start_idx + offset); - if (input_ptr == nullptr) { - input_ptr = experimental::MakePtenTensorBaseFromVar( - *ins_vector[offset], in_def); - } else { - experimental::ReMakePtenDenseTensorFromVar( - *ins_vector[offset], in_def, - pt_kernel_context_->MutableInputAt(start_idx + - offset)); - } - } else { - pt_kernel_context_->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], - in_def)); - } + pt_kernel_context->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def)); } - pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i); + pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -1870,43 +1839,24 @@ void OperatorWithKernel::BuildPtenKernelContext( auto& outs_vector = ctx.outputs.at(output_names[i]); size_t start_idx = - (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); + (i == 0 ? 0 : pt_kernel_context->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - auto current_vector_size = pt_kernel_context_->OutputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. 
for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto* buffer_tensor = - pt_kernel_context_->MutableOutputAt(start_idx + - offset); - if (buffer_tensor) { - experimental::ReMakePtenDenseTensorFromVar(outs_vector[offset], - out_def, buffer_tensor); - } - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(outs_vector[offset], - out_def)); - } + pt_kernel_context->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(outs_vector[offset], + out_def)); } // Deal with the case that some outputs are NULL when run the kernel. // For example : the outputs of matmul_grad are dx and dy, // sometimes dx or dy may be NULL. if (outs_vector.empty()) { - if (current_vector_size > start_idx) { - pt_kernel_context_->SetOutputWithoutSetRange(start_idx, {nullptr}); - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange({nullptr}); - } + pt_kernel_context->EmplaceBackOutputWithoutSetRange({nullptr}); end_idx = start_idx + 1; } - pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), - i); + pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -1915,11 +1865,11 @@ void OperatorWithKernel::BuildPtenKernelContext( if (attr_iter != Attrs().end()) { // shape is in the attribute if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -1930,10 +1880,10 @@ void OperatorWithKernel::BuildPtenKernelContext( } else { // shape is in the input auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarArrayFromVarList(ins_vector))); } } @@ -1946,11 +1896,11 @@ void OperatorWithKernel::BuildPtenKernelContext( if (attr_iter != Attrs().end()) { // scalar is in the attribute auto& attr = Attrs().at(attr_names[i]); if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - pt_kernel_context_->EmplaceBackAttr( + pt_kernel_context->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(float, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::string))) { - pt_kernel_context_->EmplaceBackAttr( + pt_kernel_context->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -1960,7 +1910,7 @@ void OperatorWithKernel::BuildPtenKernelContext( } } else { auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarFromVar(*ins_vector.front()))); } @@ -1968,17 +1918,17 
@@ void OperatorWithKernel::BuildPtenKernelContext( // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(pten::DataType))) { auto data_type = pten::TransToPtenDataType( static_cast( BOOST_GET_CONST(int, attr))); - pt_kernel_context_->EmplaceBackAttr(data_type); + pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == @@ -1987,7 +1937,7 @@ void OperatorWithKernel::BuildPtenKernelContext( const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); - pt_kernel_context_->EmplaceBackAttr(vector_int64_attr); + pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } // TODO(YuanRisheng) Need support vector attr @@ -2001,20 +1951,16 @@ void OperatorWithKernel::BuildPtenKernelContext( } } -void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const { - // auto& input_names = std::get<0>(pt_kernel_signature_->args); - // auto& attr_names = std::get<1>(pt_kernel_signature_->args); +void OperatorWithKernel::WriteBackToOutputs( + RuntimeContext* ctx, pten::KernelContext* pt_kernel_context) const { auto& output_names = std::get<2>(pt_kernel_signature_->args); - // pt_kernel_context_ - for (size_t i = 0; i < output_names.size(); ++i) { auto& outs_vector = ctx->outputs.at(output_names[i]); - auto& range_pair = pt_kernel_context_->OutputRangeAt(i); - auto pten_outs = - pt_kernel_context_->MutableOutputBetween( - range_pair.first, range_pair.second); + auto& range_pair = pt_kernel_context->OutputRangeAt(i); + auto pten_outs = pt_kernel_context->MutableOutputBetween( + range_pair.first, range_pair.second); for (size_t j = 0; j < pten_outs.size(); ++j) { if (pten_outs[j]) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 12946b416cf9f..3aab9165eae0a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -589,16 +589,14 @@ class OperatorWithKernel : public OperatorBase { void ChoosePtenKernel(const ExecutionContext& ctx) const; void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx) const; + platform::DeviceContext* dev_ctx, + pten::KernelContext* pt_kernel_context) const; - void WriteBackToOutputs(RuntimeContext* ctx) const; + void WriteBackToOutputs(RuntimeContext* ctx, + pten::KernelContext* pt_kernel_context) const; pten::Kernel* PtenKernel() const { return pt_kernel_.get(); } - pten::KernelContext* PtenKernelContext() const { - return pt_kernel_context_.get(); - } - const OpKernelType* kernel_type() const { return kernel_type_.get(); } private: @@ -657,9 +655,6 @@ class OperatorWithKernel : public OperatorBase { mutable bool run_pten_kernel_ = false; mutable std::unique_ptr pt_kernel_signature_; 
mutable std::unique_ptr pt_kernel_; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - mutable std::unique_ptr pt_kernel_context_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index d8ee400e35082..cc7fcf455a13d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -409,8 +409,6 @@ void VarBase::_CopyGradientFrom(const VarBase& src) { } } -pten::KernelContext OpBase::pt_kernel_context_; - void OpBase::SetType(const std::string& type) { op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); } @@ -426,8 +424,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, - const platform::Place& place, - pten::KernelContext* pt_kernel_context) { + const platform::Place& place) { auto* op_kernel = dynamic_cast(&op); PADDLE_ENFORCE_NOT_NULL( op_kernel, platform::errors::PermissionDenied( @@ -468,8 +465,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. */ - auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, - default_attrs, pt_kernel_context); + auto prepared_op = + PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs); auto tmp_ins_ptr = PrepareData(*op_kernel, ins, prepared_op.kernel_type()); if (tmp_ins_ptr == nullptr) { @@ -497,8 +494,7 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, - &pt_kernel_context_); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void OpBase::Run(const framework::OperatorBase& op, @@ -507,8 +503,7 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, - &pt_kernel_context_); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void ClearNoNeedBufferInputs(OpBase* op) { diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index cb76a82353282..3d0847605566b 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -183,8 +183,6 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); - static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; } - bool HasVoidFunctionPostHook() const { return !void_function_post_hooks_.empty(); } @@ -212,9 +210,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - static pten::KernelContext pt_kernel_context_; std::vector>> void_function_post_hooks_; }; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 46e974c8f43f3..15a278c2e6464 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -117,7 +117,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const 
framework::KernelSignature& kernel_signature, const pten::Kernel& pt_kernel, - pten::KernelContext* pt_kernel_context, platform::DeviceContext* dev_ctx) : op_(op), ctx_(ctx), @@ -126,8 +125,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, dev_ctx_(dev_ctx), run_pten_kernel_(true), pt_kernel_signature_(kernel_signature), - pt_kernel_(pt_kernel), - pt_kernel_context_(pt_kernel_context) {} + pt_kernel_(pt_kernel) {} template PreparedOp PrepareImpl(const NameVarMap& ins, @@ -135,8 +133,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { + const framework::AttributeMap& default_attrs) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -178,7 +175,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, pt_kernel_context, dev_ctx); + pt_kernel, dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name << "` not found."; @@ -247,10 +244,8 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs, - pt_kernel_context); + const framework::AttributeMap& default_attrs) { + return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -258,10 +253,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { + const framework::AttributeMap& default_attrs) { return PrepareImpl(ins, outs, op, place, attrs, - default_attrs, pt_kernel_context); + default_attrs); } template @@ -271,13 +265,6 @@ static void BuildDygraphPtenKernelContext( const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) { - // TODO(chenweihang): now only work for very simple case, - // many cases need to be deal with later: - // 1. the input and output are not tensor - // 2. the dispensbale, duplicable input and output - // 3. needless attributes remove - // 4. use pt Tensor directly - // 5. kernel input is not DenseTensor kernel_ctx->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature.args); @@ -312,26 +299,11 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - auto current_vector_size = kernel_ctx->InputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. 
for (size_t offset = 0; offset < ins_vector.size(); ++offset) { const auto& variable = ins_vector[offset]->Var(); - if (current_vector_size > start_idx + offset) { - auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset); - if (input_ptr == nullptr) { - input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def); - } else { - experimental::ReMakePtenDenseTensorFromVar( - variable, in_def, kernel_ctx->MutableInputAt( - start_idx + offset)); - } - } else { - kernel_ctx->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); - } + kernel_ctx->EmplaceBackInputWithoutSetRange( + paddle::experimental::MakePtenTensorBaseFromVar(variable, in_def)); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -340,15 +312,10 @@ static void BuildDygraphPtenKernelContext( auto& out_def = output_defs.at(i); size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); - auto current_vector_size = kernel_ctx->OutputsSize(); auto iter = outs.find(output_names[i]); if (iter == outs.end()) { - if (current_vector_size > start_idx) { - kernel_ctx->SetOutputWithoutSetRange(start_idx, {nullptr}); - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); - } + kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); kernel_ctx->AssignOutputRange(std::make_pair(start_idx, start_idx + 1), i); continue; @@ -357,27 +324,10 @@ static void BuildDygraphPtenKernelContext( auto& outs_vector = iter->second; size_t end_idx = start_idx + outs_vector.size(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto* buffer_tensor = - kernel_ctx->MutableOutputAt(start_idx + offset); - if (buffer_tensor) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[offset]->MutableVar(), out_def, buffer_tensor); - } else { - kernel_ctx->SetOutputWithoutSetRange( - start_idx + offset, - experimental::MakePtenTensorBaseFromVar( - outs_vector[offset]->MutableVar(), out_def)); - } - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar( - outs_vector[offset]->MutableVar(), out_def)); - } + kernel_ctx->EmplaceBackOutputWithoutSetRange( + paddle::experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } @@ -556,19 +506,20 @@ static void PreparedOpRunPtImpl( const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const framework::KernelSignature& pt_kernel_signature, - const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx, + const NameVarMap& ins, const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); + pten::KernelContext pt_kernel_context; BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, dev_ctx, - pt_kernel_context); + &pt_kernel_context); - 
pt_kernel(pt_kernel_context); + pt_kernel(&pt_kernel_context); if (FLAGS_benchmark) { dev_ctx->Wait(); @@ -578,10 +529,7 @@ static void PreparedOpRunPtImpl( #endif } - WriteBackToOutputs(pt_kernel_signature, outs, pt_kernel_context); - - // Ensure that it does not affect the VarBase life cycle management - pt_kernel_context->ClearData(); + WriteBackToOutputs(pt_kernel_signature, outs, &pt_kernel_context); // TODO(chenweihang): add debug flags later if (framework::IsComplexType(kernel_type.data_type_)) { @@ -595,8 +543,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, - pt_kernel_, pt_kernel_context_, dev_ctx_, ins, - outs, attrs, default_attrs); + pt_kernel_, dev_ctx_, ins, outs, attrs, + default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -609,8 +557,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl( - op_, kernel_type_, pt_kernel_signature_, pt_kernel_, pt_kernel_context_, - dev_ctx_, ins, outs, attrs, default_attrs); + op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, + outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 29747e79ef6fa..22f016e2cadc1 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -153,25 +153,21 @@ class PreparedOp { const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::KernelSignature& kernel_signature, - const pten::Kernel& pt_kernel, - pten::KernelContext* pt_kernel_context, - platform::DeviceContext* dev_ctx); + const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context = nullptr); + const framework::AttributeMap& default_attrs); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context = nullptr); + const framework::AttributeMap& default_attrs); void Run(const NameVarMap& in, const NameVarMap& out, const framework::AttributeMap& attrs, @@ -196,9 +192,6 @@ class PreparedOp { bool run_pten_kernel_{false}; framework::KernelSignature pt_kernel_signature_; pten::Kernel pt_kernel_; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - pten::KernelContext* pt_kernel_context_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 682916a9b323b..7ed9f08906a73 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -231,8 +231,6 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, 
&exception); - // Compatible impl: clear pten kernel context data when throw error - OpBase::GetKernelContext()->ClearData(); throw std::move(exception); } catch (std::exception& ex) { PADDLE_THROW(platform::errors::Fatal( diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 10a9358612960..a3e6ea6d1bc78 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -202,22 +202,6 @@ def _thread_loop(self, legacy_expected_place): # APIs in this thread. _set_expected_place(legacy_expected_place) - # NOTE(chenweihang): [ Why need to set not to execute pten kernel here? ] - # Now, in order to ensure that the execution performance of the dynamic - # graph mode in pten compatible state does not decline significantly, - # we have adopted the approach of caching a KernelContext globally for - # the dynamic graph tracer to reduce the construction and deconstruction - # overhead of data interfaces such as the compatible state DenseTensor. - # The static graph is each op caches a KernelContext, but the op of - # the dynamic graph will be constructed and destroyed every round of - # execution, so it is impossible to cache KernelContext for each op. - # However, it is not thread-safe if using only one global kernel context in - # dynamic graph. If the pten op of paddle is used in the DataLoader thread, - # it may cause access errors. We temporarily do not execute pten kernel - # in this scenario and will find a better solution later and remove - # this setting. - set_flags({'FLAGS_run_pten_kernel': False}) - while not self._thread_done_event.is_set(): try: indices = next(self._sampler_iter) @@ -519,9 +503,6 @@ def _thread_loop(self, legacy_expected_place): # APIs in this thread. _set_expected_place(legacy_expected_place) - # NOTE(chenweihang): See Note [ Why need to set not to execute pten kernel here? 
] - set_flags({'FLAGS_run_pten_kernel': False}) - while not self._thread_done_event.is_set(): batch = self._get_data() if not self._thread_done_event.is_set(): From d13c779900b2cdab89d21e57f87ec571b8a441e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Sat, 15 Jan 2022 20:03:54 +0800 Subject: [PATCH 144/151] isolates friends of storage, test=develop (#38977) --- paddle/pten/api/lib/utils/tensor_utils.cc | 45 +++++------------------ paddle/pten/core/compat_utils.h | 9 +---- 2 files changed, 10 insertions(+), 44 deletions(-) diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index edd5cde938630..f304268bedf45 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -325,9 +325,7 @@ void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { platform::errors::InvalidArgument( "The destination Tensor is nullptr when move allocation.")); dst->Resize(src->dims()); - auto* storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - dst->ResetHolderWithType(storage->GetAllocation(), + dst->ResetHolderWithType(src->Holder(), pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); } @@ -345,19 +343,7 @@ void ReMakePtenDenseTensorBase(const paddle::framework::Tensor& src, meta->dtype = pten::TransToPtenDataType(src.type()); meta->layout = src.layout(); meta->offset = src.offset(); - - auto* shared_storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); - PADDLE_ENFORCE_NOT_NULL( - shared_storage, - platform::errors::NotFound( - "Target DenseTensor's shared storage is nullptr.")); - - PADDLE_ENFORCE_EQ(src.IsInitialized(), - true, - paddle::platform::errors::InvalidArgument( - "Source Tensor is not initialized.")); - shared_storage->ResetAllocation(src.Holder()); + dst->ResetHolder(src.Holder()); } void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, @@ -378,19 +364,12 @@ void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src, meta->layout = src.layout(); meta->offset = src.offset(); - auto* shared_storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); - PADDLE_ENFORCE_NOT_NULL( - shared_storage, - platform::errors::NotFound( - "Target DenseTensor's shared storage is nullptr.")); - if (src.IsInitialized() && src.place() == pten::TransToFluidPlace(arg_def.backend)) { - shared_storage->ResetAllocation(src.Holder()); + dst->ResetHolder(src.Holder()); } else { - shared_storage->ResetAllocationPlace( - pten::TransToFluidPlace(arg_def.backend)); + // This does not affect the correctness, and will be modified immediately. 
+ // dst->mutable_data(pten::TransToFluidPlace(arg_def.backend)); } } @@ -481,14 +460,10 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, tensor->Resize(src->dims()); SetLoD(tensor->mutable_lod(), src->lod()); - // here dynamic_cast is slow - auto* storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - if (!tensor->IsInitialized() || (tensor->IsInitialized() && - !IsSameAllocation(tensor->Holder(), storage->GetAllocation()))) { - tensor->ResetHolderWithType(std::move(storage->GetAllocation()), dtype); + !IsSameAllocation(tensor->Holder(), src->Holder()))) { + tensor->ResetHolderWithType(std::move(src->Holder()), dtype); } else { // Even the pten tensor and Variable have the same Alloctation (both have // the same pointer address, same size and same place) @@ -502,10 +477,8 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, auto dtype = pten::TransToProtoVarType(src->dtype()); if (!tensor->value().IsInitialized()) { - auto storage = dynamic_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - tensor->mutable_value()->ResetHolderWithType( - std::move(storage->GetAllocation()), dtype); + tensor->mutable_value()->ResetHolderWithType(std::move(src->Holder()), + dtype); } } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h index 0bd82080ddebc..46e53e3997cc1 100644 --- a/paddle/pten/core/compat_utils.h +++ b/paddle/pten/core/compat_utils.h @@ -31,10 +31,6 @@ namespace pten { class CompatibleDenseTensorUtils { public: - static Storage* UnsafeGetMutableStorage(DenseTensor* tensor) { - return tensor->storage_.get(); - } - static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) { return &(tensor->meta_); } @@ -42,10 +38,7 @@ class CompatibleDenseTensorUtils { // only can deal with SharedStorage now static void ClearStorage(DenseTensor* tensor) { // use static_cast to improve performance, replace by dynamic_cast later - if (tensor->storage_ != nullptr) { - static_cast(tensor->storage_.get()) - ->Reset(); - } + tensor->MoveMemoryHolder(); } static DenseTensor Slice(const DenseTensor& tensor, From 5c3586746792056d86a72f114167103f98b3af29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Sat, 15 Jan 2022 21:58:35 +0800 Subject: [PATCH 145/151] updates the ctor of tensor, test=develop (#38946) --- .../accumulation_node_test.cc | 15 ++- .../autograd_meta_test.cc | 5 +- .../data_structure_tests/eager_tensor_test.cc | 10 +- .../grad_node_info_test.cc | 10 +- .../data_structure_tests/grad_node_test.h | 5 +- .../grad_tensor_holder_test.cc | 15 ++- .../tensor_wrapper_test.cc | 10 +- .../tests/task_tests/eager_utils_test.cc | 15 ++- paddle/pten/api/lib/utils/allocator.h | 16 +-- paddle/pten/core/dense_tensor.cc | 6 +- paddle/pten/core/dense_tensor.h | 6 +- paddle/pten/core/storage.cc | 2 +- paddle/pten/core/storage.h | 19 +-- paddle/pten/tests/api/CMakeLists.txt | 2 - paddle/pten/tests/api/test_cast_api.cc | 4 +- paddle/pten/tests/api/test_conj_api.cc | 4 +- paddle/pten/tests/api/test_dot_api.cc | 6 +- paddle/pten/tests/api/test_elementwise_api.cc | 24 ++-- paddle/pten/tests/api/test_empty_api.cc | 12 +- paddle/pten/tests/api/test_fill_api.cc | 22 ++-- paddle/pten/tests/api/test_flatten_api.cc | 4 +- paddle/pten/tests/api/test_matmul_api.cc | 20 +-- paddle/pten/tests/api/test_mean_api.cc | 4 +- paddle/pten/tests/api/test_reshape_api.cc | 4 +- 
paddle/pten/tests/api/test_storage.cc | 65 --------- paddle/pten/tests/api/test_sum_api.cc | 4 +- paddle/pten/tests/api/test_tensor_utils.cc | 124 ------------------ paddle/pten/tests/api/test_to_api.cc | 4 +- paddle/pten/tests/core/CMakeLists.txt | 2 - paddle/pten/tests/core/allocator.h | 67 +--------- paddle/pten/tests/core/test_allocator.cc | 95 -------------- paddle/pten/tests/core/test_dense_tensor.cc | 13 +- paddle/pten/tests/core/test_storage.cc | 40 ------ .../pten/tests/kernels/test_cast_dev_api.cc | 4 +- .../pten/tests/kernels/test_conj_dev_api.cc | 4 +- .../pten/tests/kernels/test_copy_dev_api.cc | 6 +- .../tests/kernels/test_creation_dev_api.cc | 8 +- paddle/pten/tests/kernels/test_dot_dev_api.cc | 6 +- .../tests/kernels/test_elementwise_dev_api.cc | 24 ++-- .../tests/kernels/test_flatten_dev_api.cc | 4 +- .../pten/tests/kernels/test_matmul_dev_api.cc | 6 +- .../pten/tests/kernels/test_mean_dev_api.cc | 4 +- .../tests/kernels/test_reshape_dev_api.cc | 4 +- .../pten/tests/kernels/test_scale_dev_api.cc | 13 +- paddle/pten/tests/kernels/test_sum_dev_api.cc | 4 +- 45 files changed, 175 insertions(+), 566 deletions(-) delete mode 100644 paddle/pten/tests/api/test_storage.cc delete mode 100644 paddle/pten/tests/api/test_tensor_utils.cc delete mode 100644 paddle/pten/tests/core/test_allocator.cc delete mode 100644 paddle/pten/tests/core/test_storage.cc diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index f249d2099f24c..cdc9701009513 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -32,15 +32,17 @@ TEST(AccumulationNode, EagerTensor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT16, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; @@ -48,8 +50,9 @@ TEST(AccumulationNode, EagerTensor) { std::shared_ptr grad_dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); EagerTensor grad_et = EagerTensor(grad_dt); diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc index 96845569ca0c5..3d45dc831d411 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc @@ -42,8 +42,9 @@ TEST(AutogradMeta, MemberFunction) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 84daf4eac4ce6..a483ddb6a98f6 100644 --- 
a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -36,8 +36,9 @@ TEST(EagerTensor, Constructor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -65,8 +66,9 @@ TEST(EagerTensor, MemberFunction) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index a89fb019d5b37..7f6609b88a527 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -41,8 +41,9 @@ TEST(GradNodeInfo, GradNodeBase) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -97,8 +98,9 @@ TEST(GradNodeInfo, GradNodeBase) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 2870bfa8b0c94..433a00e27be0e 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -37,8 +37,9 @@ class GradTestNode : public egr::GradNodeBase { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 3581ef59cd5be..c88a5f5fdcef5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -36,8 +36,9 @@ TEST(GradTensorHolder, Constructor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({2, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); EagerTensor et = EagerTensor(dt); @@ -52,15 +53,17 @@ TEST(GradTensorHolder, Interfaces) { pten::DenseTensorMeta meta = 
pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; EagerTensor et1 = EagerTensor(dt1); diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 6d78cf42d0c48..8bc739d455a95 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -25,8 +25,9 @@ TEST(TensorWrapper, Basic) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -51,8 +52,9 @@ TEST(TensorWrapper, Basic) { pten::DenseTensorMeta meta2 = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt2 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta2); auto* dt_ptr2 = dt->mutable_data(); dt_ptr2[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index ea9aae83ff189..1b2f1287b069d 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -31,15 +31,17 @@ TEST(EagerUtils, AutoGradMeta) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; EagerTensor et1 = EagerTensor(dt1); @@ -106,8 +108,9 @@ egr::EagerTensor CreateTestCPUTensor(T val, pten::DenseTensorMeta(pten::DataType::FLOAT32, ddim); egr::EagerTensor tensor; std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); for (int64_t i = 0; i < dt->numel(); i++) { diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index a8c05b7651689..acdba822ac4bb 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -22,25 +22,15 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { -class DefaultAllocator : public pten::deprecated::Allocator { +class DefaultAllocator : public pten::Allocator { public: - using Allocation = pten::deprecated::Allocation; explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} - static void Delete(Allocation* allocation) { - paddle::memory::allocation::Allocator::AllocationDeleter( - allocation->CastContextWithoutCheck()); + AllocationPtr Allocate(size_t bytes_size) override { + return memory::Alloc(place_, bytes_size); } - Allocation Allocate(size_t bytes_size) override { - paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size); - void* ptr = a->ptr(); - return Allocation(ptr, a.release(), &Delete, place_); - } - - const paddle::platform::Place& place() override { return place_; } - private: paddle::platform::Place place_; }; diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index eb6f834d72779..716e1ac3d30bb 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -24,13 +24,11 @@ limitations under the License. */ namespace pten { -DenseTensor::DenseTensor(const std::shared_ptr& a, - const DenseTensorMeta& meta) +DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) : meta_(meta), storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} -DenseTensor::DenseTensor(const std::shared_ptr& a, - DenseTensorMeta&& meta) +DenseTensor::DenseTensor(Allocator* a, DenseTensorMeta&& meta) : meta_(std::move(meta)), storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 4f25fc296724c..db8d7a2a39c90 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -60,17 +60,15 @@ class TensorInplaceVersion { class DenseTensor : public TensorBase, public TypeInfoTraits { public: - using Allocator = deprecated::Allocator; - /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. - DenseTensor(const std::shared_ptr& a, const DenseTensorMeta& meta); + DenseTensor(Allocator* a, const DenseTensorMeta& meta); /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. - DenseTensor(const std::shared_ptr& a, DenseTensorMeta&& meta); + DenseTensor(Allocator* a, DenseTensorMeta&& meta); /// \brief Use existing storage space to create dense tensor. This interface /// can be used to deliberately create an uninitialized dense tensor. 
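For context on the ctor change above, a minimal usage sketch (not taken from the patch itself; it simply mirrors the updated tests in this series and assumes the caller keeps the DefaultAllocator alive for the lifetime of the tensor, with includes kept to the ones the sketch needs):

#include <memory>

#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/dense_tensor.h"

void DenseTensorCtorSketch() {
  // The allocator is now borrowed via a raw pointer, so the caller owns it
  // and must keep it alive while the tensor uses it.
  auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());

  pten::DenseTensor dense_x(
      alloc.get(),
      pten::DenseTensorMeta(pten::DataType::FLOAT32,
                            paddle::framework::make_ddim({3, 4}),
                            pten::DataLayout::NCHW));

  // mutable_data allocates through the borrowed allocator on first use.
  float* data = dense_x.mutable_data<float>();
  data[0] = 1.0f;
}
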
diff --git a/paddle/pten/core/storage.cc b/paddle/pten/core/storage.cc index f7c7f68734101..aacae7be88349 100644 --- a/paddle/pten/core/storage.cc +++ b/paddle/pten/core/storage.cc @@ -18,7 +18,7 @@ namespace pten { void TensorStorage::Realloc(size_t size) { this->Clear(); - data_ = paddle::memory::AllocShared(alloc_->place(), size); + data_ = alloc_->Allocate(size); size_ = size; } diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index cf18dd913093a..97d7f8d0f1105 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -91,12 +91,11 @@ class Storage : public intrusive_ref_counter { class TensorStorage : public Storage { public: using Place = paddle::platform::Place; - using Allocator = deprecated::Allocator; - explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} + explicit TensorStorage(Allocator* a) : alloc_(a) {} - TensorStorage(const std::shared_ptr& a, size_t size) - : Storage(paddle::memory::AllocShared(a->place(), size)), alloc_(a) { + TensorStorage(Allocator* a, size_t size) + : Storage(a->Allocate(size)), alloc_(a) { size_ = data_->size(); } @@ -114,24 +113,18 @@ class TensorStorage : public Storage { size_t size() const noexcept override { return size_; } const Place& place() const override { - if (!data_ && !alloc_) { + if (!data_) { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Unable to visit place: either data_ or alloc_ has to be initialized " "first.")); } - if (data_) { - return data_->place(); - } - return alloc_->place(); + return data_->place(); } bool OwnsMemory() const noexcept override { return true; } - const std::shared_ptr& allocator() const noexcept { - return alloc_; - } private: - const std::shared_ptr alloc_; + Allocator* alloc_; int64_t size_{0}; }; diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index ffbc551843148..79d9a3d82e69e 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -5,8 +5,6 @@ else() endif() cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) -cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils) -cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils) cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc index c2660a1f80019..6608d1ed08cab 100644 --- a/paddle/pten/tests/api/test_cast_api.cc +++ b/paddle/pten/tests/api/test_cast_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, cast) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc index 928f8e414fda0..50d190257a16d 100644 --- a/paddle/pten/tests/api/test_conj_api.cc +++ b/paddle/pten/tests/api/test_conj_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, conj) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::COMPLEX64, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc index 41c03f8f26201..40e709b960334 100644 --- a/paddle/pten/tests/api/test_dot_api.cc +++ b/paddle/pten/tests/api/test_dot_api.cc @@ -30,17 +30,17 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, dot) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index e5971aae5513f..69af32eb457a6 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -30,17 +30,17 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, add) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -84,17 +84,17 @@ TEST(API, add) { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, subtract) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -138,17 +138,17 @@ TEST(API, subtract) { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, divide) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -192,17 +192,17 @@ TEST(API, divide) { TEST(API, multiply) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc index fcc01ad8a7172..f4e3f472c7990 100644 --- a/paddle/pten/tests/api/test_empty_api.cc +++ b/paddle/pten/tests/api/test_empty_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, empty_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -55,11 +55,11 @@ TEST(API, empty_like) { TEST(API, empty1) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_shape = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({2}), pten::DataLayout::NCHW)); @@ -83,11 +83,11 @@ TEST(API, empty1) { } TEST(API, empty2) { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc index e87d094eec9d3..0d823765680e8 100644 --- a/paddle/pten/tests/api/test_fill_api.cc +++ b/paddle/pten/tests/api/test_fill_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, full_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -65,10 +65,10 @@ TEST(API, full_like) { TEST(API, zeros_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -98,10 +98,10 @@ TEST(API, zeros_like) { TEST(API, ones_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -131,11 +131,11 @@ TEST(API, ones_like) { TEST(API, full1) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_shape = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({2}), pten::DataLayout::NCHW)); @@ -144,7 +144,7 @@ TEST(API, full1) { shape_data[1] = 3; auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); @@ -177,11 +177,11 @@ TEST(API, full1) { } TEST(API, full2) { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc index 93c8a50f02a78..6c082b9653e6f 100644 --- a/paddle/pten/tests/api/test_flatten_api.cc +++ b/paddle/pten/tests/api/test_flatten_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, flatten) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index bef0e2af4cf92..03f686f1c3f5e 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; TEST(API, matmul_cpu) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -41,7 +41,7 @@ TEST(API, matmul_cpu) { auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -79,10 +79,10 @@ TEST(API, matmul_cpu) { TEST(API, matmul_cuda) { // Prepare CPU Dense Tensor const auto alloc_cpu = - std::make_shared( + std::make_unique( paddle::platform::CPUPlace()); auto ref_x = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -90,7 +90,7 @@ TEST(API, matmul_cuda) { auto* ref_x_data = ref_x->mutable_data(); auto ref_y = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -104,16 +104,16 @@ TEST(API, matmul_cuda) { // 1. 
create tensor const auto alloc_cuda = - std::make_shared( + std::make_unique( paddle::platform::CUDAPlace()); auto dense_x = std::make_shared( - alloc_cuda, + alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto dense_y = std::make_shared( - alloc_cuda, + alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -143,7 +143,7 @@ TEST(API, matmul_cuda) { auto dense_out = std::dynamic_pointer_cast(out.impl()); auto ref_out = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta( pten::DataType::FLOAT32, out.dims(), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc index a8c4c5306dced..9d90e58101cbd 100644 --- a/paddle/pten/tests/api/test_mean_api.cc +++ b/paddle/pten/tests/api/test_mean_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, mean) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index 227dcc6e9568d..59e9e9fab1122 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, reshape) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_storage.cc b/paddle/pten/tests/api/test_storage.cc deleted file mode 100644 index 1a5d95f9419c5..0000000000000 --- a/paddle/pten/tests/api/test_storage.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "gtest/gtest.h" - -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/api/lib/utils/storage.h" - -namespace paddle { -namespace tests { - -TEST(host_storage, external_stroage) { - const size_t size{100}; - const auto a = std::make_shared( - paddle::platform::CPUPlace()); - pten::intrusive_ptr in_storage = - pten::make_intrusive(a, size); - char* data = static_cast(in_storage->data()); - for (size_t i = 0; i < size; ++i) { - data[i] = i; - } - const size_t delta{1}; - const size_t n{10}; - auto ex_storage = - pten::make_intrusive(in_storage, delta, n); - CHECK_EQ(ex_storage->size(), n); - CHECK(paddle::platform::is_cpu_place(ex_storage->place())); - CHECK(!ex_storage->OwnsMemory()); - for (size_t i = delta; i < delta + n; ++i) { - CHECK_EQ(data[i], static_cast(i)); - } -} - -TEST(host_storage, external_vector) { - std::vector data(100); - for (size_t i = 0; i < data.size(); ++i) { - data[i] = i; - } - const size_t delta{1}; - const size_t n{10}; - auto ex_storage = pten::make_intrusive( - data.data(), n, paddle::platform::CPUPlace()); - CHECK_EQ(ex_storage->size(), n); - CHECK(paddle::platform::is_cpu_place(ex_storage->place())); - CHECK(!ex_storage->OwnsMemory()); - for (size_t i = delta; i < delta + n; ++i) { - CHECK_EQ(data[i], static_cast(i)); - } -} - -} // namespace tests -} // namespace paddle diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc index ff1609d3d4051..5a7c9840e1114 100644 --- a/paddle/pten/tests/api/test_sum_api.cc +++ b/paddle/pten/tests/api/test_sum_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, sum) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc deleted file mode 100644 index 041bd28ad892a..0000000000000 --- a/paddle/pten/tests/api/test_tensor_utils.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "gtest/gtest.h" - -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/tensor_meta.h" - -namespace paddle { -namespace tests { - -using DDim = paddle::framework::DDim; -using DataType = paddle::experimental::DataType; -using DataLayout = paddle::experimental::DataLayout; - -using DenseTensor = pten::DenseTensor; -using DenseTensorMeta = pten::DenseTensorMeta; - -TEST(tensor_utils, dense_tensor_to_lod_tensor) { - const DDim dims({2, 1}); - const DataType dtype{DataType::FLOAT32}; - const DataLayout layout{DataLayout::NCHW}; - const pten::LoD lod{{0, 2}}; - DenseTensorMeta meta(dtype, dims, layout, lod); - - auto alloc = - std::make_shared(platform::CPUPlace()); - - DenseTensor dense_tensor(alloc, meta); - float* data = dense_tensor.mutable_data(); - data[0] = 1.0f; - data[1] = 2.1f; - - framework::LoDTensor lod_tensor; - experimental::MovesStorage(&dense_tensor, &lod_tensor); - - CHECK(dense_tensor.lod().size() == lod_tensor.lod().size()); - CHECK(dense_tensor.lod()[0] == - static_cast>((lod_tensor.lod()[0]))); - CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type())); - CHECK(dense_tensor.layout() == lod_tensor.layout()); - CHECK(platform::is_cpu_place(lod_tensor.place())); - - CHECK(lod_tensor.data()[0] == 1.0f); - CHECK(lod_tensor.data()[1] == 2.1f); - - auto dense_tensor_1 = experimental::MakePtenDenseTensor(lod_tensor); - CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->dtype() == dtype); - CHECK(dense_tensor_1->layout() == layout); - CHECK(dense_tensor_1->lod().size() == lod.size()); - CHECK(dense_tensor_1->lod()[0] == lod[0]); - const float* data_1 = dense_tensor_1->data(); - CHECK(data_1[0] == 1.0f); - CHECK(data_1[1] == 2.1f); -} - -TEST(tensor_utils, dense_tensor_to_tensor) { - const DDim dims({2, 1}); - const DataType dtype{DataType::FLOAT32}; - const DataLayout layout{DataLayout::NCHW}; - DenseTensorMeta meta(dtype, dims, layout); - - auto alloc = - std::make_shared(platform::CPUPlace()); - - DenseTensor dense_tensor(alloc, meta); - float* data = dense_tensor.mutable_data(); - data[0] = 1.0f; - data[1] = 2.1f; - - framework::Tensor tensor; - experimental::MovesStorage(&dense_tensor, &tensor); - - CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(tensor.type())); - CHECK(dense_tensor.layout() == tensor.layout()); - CHECK(platform::is_cpu_place(tensor.place())); - - CHECK(tensor.data()[0] == 1.0f); - CHECK(tensor.data()[1] == 2.1f); - - auto dense_tensor_1 = experimental::MakePtenDenseTensor(tensor); - CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->dtype() == dtype); - CHECK(dense_tensor_1->layout() == layout); - const float* data_1 = dense_tensor_1->data(); - CHECK(data_1[0] == 1.0f); - CHECK(data_1[1] == 2.1f); -} - -TEST(PtenUtils, VarToPtTensor) { - // 1. create Variable - paddle::framework::Variable v; - auto selected_rows = v.GetMutable(); - paddle::framework::Tensor* value = selected_rows->mutable_value(); - auto* data = value->mutable_data(paddle::framework::make_ddim({1, 1}), - paddle::platform::CPUPlace()); - data[0] = 123; - pten::Backend expect_backend = pten::Backend::CPU; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - expect_backend = pten::Backend::GPU; -#endif - auto tensor_def = pten::TensorArgDef( - expect_backend, pten::DataLayout::NCHW, pten::DataType::INT32); - // 2. test API - auto tensor_x = experimental::MakePtenTensorBaseFromVar(v, tensor_def); - // 3. 
check result - ASSERT_EQ(tensor_x->dtype(), pten::DataType::INT32); -} - -} // namespace tests -} // namespace paddle diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc index 47e8ff7c2c87e..9aef716029a69 100644 --- a/paddle/pten/tests/api/test_to_api.cc +++ b/paddle/pten/tests/api/test_to_api.cc @@ -28,10 +28,10 @@ namespace framework = paddle::framework; using DDim = paddle::framework::DDim; paddle::experimental::Tensor CreateInputTensor() { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 9a5cfecc2917b..07554f02d9992 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -1,5 +1,3 @@ -cc_test(test_allocator SRCS test_allocator.cc DEPS tensor_base) -cc_test(test_storage SRCS test_storage.cc DEPS tensor_base) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index c2c74e1aacf1f..e78f288e8e545 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -21,76 +21,19 @@ limitations under the License. */ namespace pten { namespace tests { -class HostAllocatorSample : public pten::deprecated::RawAllocator { +class FancyAllocator : public pten::Allocator { public: - using Place = paddle::platform::Place; - void* Allocate(size_t bytes_size) override { - return ::operator new(bytes_size); - } - void Deallocate(void* ptr, size_t bytes_size) override { - return ::operator delete(ptr); - } - const Place& place() const override { return place_; } - - private: - Place place_{paddle::platform::CPUPlace()}; -}; - -class FancyAllocator : public pten::deprecated::Allocator { - public: - using Allocation = pten::deprecated::Allocation; static void Delete(Allocation* allocation) { ::operator delete(allocation->ptr()); } - Allocation Allocate(size_t bytes_size) override { + AllocationPtr Allocate(size_t bytes_size) override { void* data = ::operator new(bytes_size); - return Allocation(data, data, &Delete, place()); - } - - const paddle::platform::Place& place() override { return place_; } - - paddle::platform::Place place_ = paddle::platform::CPUPlace(); -}; - -template -struct CustomAllocator { - using value_type = T; - using Allocator = pten::deprecated::RawAllocator; - - explicit CustomAllocator(const std::shared_ptr& a) noexcept - : alloc_(a) {} - - CustomAllocator(const CustomAllocator&) noexcept = default; - T* allocate(std::size_t n) { - return static_cast(alloc_->Allocate(n * sizeof(T))); - } - void deallocate(T* p, std::size_t n) { - return alloc_->Deallocate(p, sizeof(T) * n); + auto* allocation = + new pten::Allocation(data, bytes_size, paddle::platform::CPUPlace()); + return AllocationPtr(allocation, Delete); } - - template - friend bool operator==(const CustomAllocator&, - const CustomAllocator&) noexcept; - template - friend bool operator!=(const CustomAllocator&, - const CustomAllocator&) noexcept; - - private: - std::shared_ptr alloc_; }; -template -inline bool operator==(const CustomAllocator& lhs, - const CustomAllocator& rhs) noexcept { - return &lhs.alloc_ == 
&rhs.alloc_; -} - -template -inline bool operator!=(const CustomAllocator& lhs, - const CustomAllocator& rhs) noexcept { - return &lhs.alloc_ != &rhs.alloc_; -} - } // namespace tests } // namespace pten diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc deleted file mode 100644 index 94ba9a1e1b9a2..0000000000000 --- a/paddle/pten/tests/core/test_allocator.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/pten/tests/core/allocator.h" -#include "paddle/pten/tests/core/random.h" -#include "paddle/pten/tests/core/timer.h" - -namespace pten { -namespace tests { - -using RawAllocator = pten::deprecated::RawAllocator; -using Allocator = pten::deprecated::Allocator; -using Allocation = pten::deprecated::Allocation; - -template -bool host_allocator_test(size_t vector_size) { - std::vector src(vector_size); - std::generate(src.begin(), src.end(), make_generator(src)); - std::vector> dst( - src.begin(), - src.end(), - CustomAllocator(std::make_shared())); - return std::equal(src.begin(), src.end(), dst.begin()); -} - -TEST(raw_allocator, host) { - CHECK(host_allocator_test(1000)); - CHECK(host_allocator_test(1000)); - CHECK(host_allocator_test(1000)); -} - -class StorageRawAlloc { - public: - StorageRawAlloc(const std::shared_ptr& a, size_t size) - : alloc_(a) { - data_ = alloc_->Allocate(size); - } - ~StorageRawAlloc() { alloc_->Deallocate(data_, size); } - - private: - void* data_; - size_t size; - std::shared_ptr alloc_; -}; - -class StorageFancyAlloc { - public: - StorageFancyAlloc(const std::shared_ptr& a, size_t size) - : alloc_(a), allocation_(a->Allocate(size)) {} - - private: - std::shared_ptr alloc_; - Allocation allocation_; -}; - -TEST(benchmark, allocator) { - std::shared_ptr raw_allocator(new HostAllocatorSample); - std::shared_ptr fancy_allocator(new FancyAllocator); - const size_t cycles = 100; - Timer timer; - double t1{}, t2{}; - for (size_t i = 0; i < cycles; ++i) { - timer.tic(); - for (size_t i = 0; i < cycles; ++i) { - StorageRawAlloc(raw_allocator, i * 100); - } - t1 += timer.toc(); - timer.tic(); - for (size_t i = 0; i < cycles; ++i) { - StorageFancyAlloc(fancy_allocator, i * 100); - } - t2 += timer.toc(); - } - std::cout << "The cost of raw alloc is " << t1 << "ms.\n"; - std::cout << "The cost of fancy alloc with place is " << t2 << "ms.\n"; -} - -} // namespace tests -} // namespace pten diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 8277c0d8dadb7..8564969796c7e 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -75,7 +75,8 @@ TEST(dense_tensor, ctor) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* 
alloc = fancy_allocator.get(); auto check_dense_tensor = [](const DenseTensor& t, const DenseTensorMeta& m) -> bool { @@ -95,10 +96,6 @@ TEST(dense_tensor, ctor) { DenseTensor tensor_1(alloc, DenseTensorMeta(meta)); check_dense_tensor(tensor_0, meta); - - DenseTensor tensor_2(make_intrusive(alloc), meta); - CHECK_NOTNULL(tensor_2.mutable_data()); - check_dense_tensor(tensor_2, meta); } TEST(dense_tensor, resize) { @@ -108,7 +105,8 @@ TEST(dense_tensor, resize) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* alloc = fancy_allocator.get(); DenseTensor tensor_0(alloc, meta); CHECK_EQ(tensor_0.capacity(), 2u); @@ -125,7 +123,8 @@ TEST(dense_tensor, shallow_copy) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* alloc = fancy_allocator.get(); DenseTensor tensor_0(alloc, meta); DenseTensor tensor_1(tensor_0); diff --git a/paddle/pten/tests/core/test_storage.cc b/paddle/pten/tests/core/test_storage.cc deleted file mode 100644 index 69d1eae668c58..0000000000000 --- a/paddle/pten/tests/core/test_storage.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "gtest/gtest.h" - -#include "paddle/pten/core/storage.h" -#include "paddle/pten/tests/core/allocator.h" - -namespace pten { -namespace tests { - -TEST(host_storage, internal) { - // TODO(Shixiaowei02): Here we need to consider the case - // where the size is zero. - const size_t size{100}; - const auto a = std::make_shared(); - TensorStorage storage(a, size); - CHECK_EQ(storage.size(), size); - CHECK(paddle::platform::is_cpu_place(storage.place())); - CHECK(storage.OwnsMemory()); - CHECK(storage.allocator() == a); - storage.Realloc(size + 100); - CHECK_EQ(storage.size(), size + 100); -} - -} // namespace tests -} // namespace pten diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index cb45d827e3be9..90624adeb344e 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -31,9 +31,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, cast) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 3392626dc2ad3..789d55491f368 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, conj) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::COMPLEX64, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 3095c83d97c98..c4d8c37eb9e0f 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -31,17 +31,17 @@ using DDim = paddle::framework::DDim; // in 'paddle/api' TEST(DEV_API, copy) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_src = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({2, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_src->mutable_data(); auto dense_dst = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index 4d753f7d09b8e..169a77cf3436b 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -50,9 +50,9 @@ TEST(DEV_API, empty) { TEST(DEV_API, empty_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -105,9 +105,9 @@ TEST(DEV_API, full) { TEST(DEV_API, full_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 6e2166cb673bd..a5773b8aa9690 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -29,15 +29,15 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, dot) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - pten::DenseTensor dense_y(alloc, + pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index bd09ecb770a5d..40998a8d57caa 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -29,15 +29,15 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, add) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - pten::DenseTensor dense_y(alloc, + pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -82,15 +82,15 @@ TEST(DEV_API, add) { TEST(DEV_API, subtract) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - pten::DenseTensor dense_y(alloc, + pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -135,15 +135,15 @@ TEST(DEV_API, subtract) { TEST(DEV_API, divide) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - pten::DenseTensor dense_y(alloc, + pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -188,15 +188,15 @@ TEST(DEV_API, divide) { TEST(DEV_API, multiply) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - pten::DenseTensor dense_y(alloc, + pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index f18e5c050ba70..d66ff468fcf48 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -39,10 +39,10 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, flatten) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); pten::DenseTensor dense_x( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 7ac3d19554581..0c1338f195563 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -29,16 +29,16 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, dot) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - DenseTensor dense_x(alloc, + DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x.mutable_data(); - DenseTensor dense_y(alloc, + DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 4b254e7e6c1ac..98782fd5dae0b 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, mean) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 0196e1c211004..02139d02de17e 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(DEV_API, reshape) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); pten::DenseTensor dense_x( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index fe26f56552b05..02f324deb4cec 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, scale) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); @@ -69,9 +69,9 @@ TEST(DEV_API, scale) { TEST(DEV_API, scale_host) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); @@ -79,9 +79,8 @@ TEST(DEV_API, scale_host) { for (size_t i = 0; i < 12; ++i) { dense_x_data[i] = i * 1.0; } - const auto alloc2 = std::make_shared( - paddle::platform::CPUPlace()); - pten::DenseTensor scale(alloc2, + + pten::DenseTensor scale(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index afaf903063781..312a6ce6100bb 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, sum) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); From 192184e8f3b36ca0b7843f765b2e004becf05e43 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 16 Jan 2022 22:54:18 +0800 Subject: [PATCH 146/151] [Pten] Add select kernel map method for infrt (#38972) * add select kernel map method * fix error --- paddle/pten/core/kernel_factory.cc | 9 +++++++++ paddle/pten/core/kernel_factory.h | 3 +++ paddle/pten/tests/core/CMakeLists.txt | 2 +- paddle/pten/tests/core/test_kernel_factory.cc | 12 +++++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc index 799b860859762..f10b58506f728 100644 --- a/paddle/pten/core/kernel_factory.cc +++ b/paddle/pten/core/kernel_factory.cc @@ -50,6 +50,15 @@ Kernel KernelFactory::SelectKernel(const std::string& kernel_name, return kernel_iter->second; } +paddle::flat_hash_map +KernelFactory::SelectKernelMap(const std::string& kernel_name) const { + auto iter = kernels_.find(kernel_name); + if (iter == kernels_.end()) { + return paddle::flat_hash_map(); + } + return iter->second; +} + const Kernel& KernelFactory::SelectKernelOrThrowError( const std::string& kernel_name, const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index e0585aea7f3db..bd26d86a34a09 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -232,6 +232,9 @@ class KernelFactory { Kernel SelectKernel(const std::string& kernel_name, const KernelKey& kernel_key) const; + paddle::flat_hash_map SelectKernelMap( + const std::string& kernel_name) const; + private: KernelFactory() = default; diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 07554f02d9992..2d4ee7f6d6a47 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -2,4 +2,4 @@ cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils) -cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory) 
+cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel) diff --git a/paddle/pten/tests/core/test_kernel_factory.cc b/paddle/pten/tests/core/test_kernel_factory.cc index 3f271b2a8f0d0..5355921ddbe01 100644 --- a/paddle/pten/tests/core/test_kernel_factory.cc +++ b/paddle/pten/tests/core/test_kernel_factory.cc @@ -16,9 +16,12 @@ limitations under the License. */ #include #include "paddle/pten/core/kernel_factory.h" +#include "paddle/pten/core/kernel_registry.h" #include "gtest/gtest.h" +PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + namespace pten { namespace tests { @@ -33,9 +36,16 @@ TEST(KernelKey, ConstructAndOStream) { std::ostringstream oss; oss << key; std::cout << oss.str(); - // EXPECT_EQ(oss.str(), "scale.host"); oss.flush(); } +TEST(KernelFactory, SelectedKernelMap) { + auto kernel_map = pten::KernelFactory::Instance().SelectKernelMap("scale"); + EXPECT_GT(kernel_map.size(), 1UL); + for (auto& iter : kernel_map) { + std::cout << iter.first << ": " << iter.second; + } +} + } // namespace tests } // namespace pten From 3115d005aa6ec9fe2ae37332be6652124b9e5543 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 17 Jan 2022 09:33:15 +0800 Subject: [PATCH 147/151] Removed debug info (#38947) --- paddle/fluid/framework/program_desc.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 60b93f4a71664..4a31adcca65ec 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -101,25 +101,20 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE_EQ(desc_.ParseFromString(binary_str), true, platform::errors::InvalidArgument( "Failed to parse program_desc from binary string.")); - VLOG(1) << 3333; InitFromProto(); } void ProgramDesc::InitFromProto() { - VLOG(1) << 4444; for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } - VLOG(1) << 5555; for (auto &block : blocks_) { for (auto *op : block->AllOps()) { for (const auto &attr : op->Proto()->attrs()) { if (attr.type() == proto::AttrType::BLOCK) { - VLOG(1) << 6666; size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } else if (attr.type() == proto::AttrType::BLOCKS) { - VLOG(1) << 7777; auto blks_idx = attr.blocks_idx(); std::vector block_descs; for (int blk_idx : blks_idx) { From f81569e37cbe61106255e1b52757010ffe84bf58 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Mon, 17 Jan 2022 09:44:07 +0800 Subject: [PATCH 148/151] Support auto prune logic in eager mode (#38960) * support test_auto_prune_partial * support rest of autoprune strategy in eager mode --- .../auto_code_generator/eager_generator.cc | 2 +- paddle/fluid/eager/backward.cc | 49 +++++++++++--- paddle/fluid/eager/eager_tensor.h | 8 +++ .../eager/tests/task_tests/backward_test.cc | 8 ++- .../cross_batch_accumulation_test.cc | 1 + .../fluid/eager/tests/task_tests/hook_test.cc | 2 + paddle/fluid/eager/utils.cc | 7 +- paddle/fluid/pybind/eager_method.cc | 22 ++++++- paddle/fluid/pybind/eager_properties.cc | 23 ++++++- paddle/fluid/pybind/eager_utils.cc | 12 ++++ paddle/fluid/pybind/eager_utils.h | 3 + paddle/pten/core/dense_tensor.cc | 3 + .../fluid/dygraph/varbase_patch_methods.py | 13 ++-- .../tests/unittests/test_egr_python_api.py | 28 +++++++- .../unittests/test_imperative_auto_prune.py | 66 ++++++++++++++++--- 15 files changed, 211 insertions(+), 36 deletions(-) diff --git 
a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 2c3207b116e29..11e033e1e5978 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1852,7 +1852,7 @@ static std::string GenerateGradNodeCCContents( " %s\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf( - BWD_RETURN_TEMPLATE, outs_size, generated_grad_function_body); + BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 01cb1b81e341e..b0e3d81df3a64 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -103,7 +103,17 @@ void RunBackward(const std::vector& tensors, VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first << ", rank: " << input_info.second; // Get target GradNodeBase from target tensors - GradNodeBase* grad_node = auto_grad_meta->GetMutableGradNode().get(); + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << tensor.name(); + continue; + } + + GradNodeBase* grad_node = shared_grad_node.get(); // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { @@ -192,19 +202,38 @@ void RunBackward(const std::vector& tensors, // Since we make edge has as same rank as bwd outputs, we indexing them // with // the same rank(i, j) - VLOG(6) << "Get Edge with slot: " << i << ", rank: " << j; - egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j]; - if (!grad_output_tensor.defined() || - !grad_output_tensor.initialized()) { - VLOG(6) << "We get grad_output_tensor with slot: " << i - << ", rank: " << j << " as uninitialized or undefined tensor"; - } - GradNodeBase* next_node = edge.GetMutableGradNode().get(); + auto next_node_shared = edge.GetMutableGradNode(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs - if (!next_node) continue; + if (!next_node_shared || !next_node_shared.get() || + grad_output_tensors[i].empty()) { + continue; + } + PADDLE_ENFORCE_LT( + j, grad_output_tensors[i].size(), + paddle::platform::errors::Fatal( + "Rank of grad_output_tensors should be less than " + "grad_output_tensors[i].size(), which is: %d. This error may " + "indicate autoprune or autograd api error. 
", + grad_output_tensors.size())); + egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j]; + + if ((!grad_output_tensor.defined() || + !grad_output_tensor.initialized())) { + if (!grad_output_tensor.Var().IsInitialized()) { + VLOG(6) + << "We get grad_output_tensor with slot: " << i + << ", rank: " << j + << " as uninitialized or undefined in both tensor and variable"; + } + } + VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i + << ", rank: " << j + << " 's name is: " << grad_output_tensor.name(); + + auto* next_node = next_node_shared.get(); if (!node_input_buffers_dict.count(next_node)) { node_input_buffers_dict[next_node] = diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index c58c0b9e66e7a..8b8423c6173fb 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -164,6 +164,14 @@ class EagerTensor final { */ void reset() { tensor_->reset(); } + /** + * @brief Determine whether tensor is DenseTensor + * + * @return true + * @return false + */ + bool is_dense_tensor() const { return tensor_->is_dense_tensor(); } + /** * @brief Transfer the current Tensor to the specified device and return. * diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 3737fd95ad64d..8f0e6cc5e41c9 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -56,6 +56,7 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); // Connect Tensor and AccumulationNode via AutoGradMeta auto acc_node_ptr = std::make_shared(); @@ -119,7 +120,7 @@ TEST(Backward, SingleNodeCustomGrad) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - + auto_grad_meta->SetStopGradient(false); // Connect Tensor and AccumulationNode via AutoGradMeta auto acc_node_ptr = std::make_shared(); @@ -189,7 +190,7 @@ TEST(Backward, LinearNodes) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - + auto_grad_meta->SetStopGradient(false); // Connect Node0 -> Node1 via Edge auto meta0 = egr::AutogradMeta(); meta0.SetStopGradient(false); @@ -281,13 +282,14 @@ TEST(Backward, WithAccumulation) { auto_grad_meta0->SetGradNode( std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); - + auto_grad_meta0->SetStopGradient(false); // Connect Inp1 and Node1 via AutoGradMeta AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&(target_tensors[1])); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(node1_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); // Connect Node0 -> Node2 via Edge auto meta0 = egr::AutogradMeta(); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 7f180fa1076fd..523f7102af04d 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -58,6 +58,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + 
auto_grad_meta->SetStopGradient(false); egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 auto meta = AutogradMeta(); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 0f8039dade801..4f4a33b1a743a 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -93,6 +93,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); target_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); @@ -171,6 +172,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); target_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index e73dfa2ec8b6e..f50458e556276 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -99,7 +99,12 @@ std::pair EagerUtils::OutRankInfo( std::shared_ptr EagerUtils::grad_node( const egr::EagerTensor& target) { - return unsafe_autograd_meta(target)->GetMutableGradNode(); + auto* meta = nullable_autograd_meta(target); + if (meta) { + return meta->GetMutableGradNode(); + } else { + return nullptr; + } } void EagerUtils::SetHistory(std::vector* autograd_metas, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 46b56f27ff98e..4419640ccf328 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -298,6 +298,21 @@ static PyObject* eager_tensor_method_detach(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_tensor_method_get_underline_tensor( + EagerTensorObject* self, PyObject* args, PyObject* kwargs) { + EAGER_SYNC_TRY + if (self->eager_tensor.is_dense_tensor()) { + auto* tensor = static_cast( + self->eager_tensor.impl().get()); + VLOG(6) << "tensor: " << tensor->IsInitialized(); + return ToPyObject(tensor); + } else { + Py_IncRef(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -315,14 +330,17 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_is_shared_buffer_to", + {"_share_buffer_to", (PyCFunction)(void (*)(void))eager_tensor__share_buffer_to, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_share_buffer_with", + {"_is_shared_buffer_with", (PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with, METH_VARARGS | METH_KEYWORDS, NULL}, {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, METH_VARARGS | METH_KEYWORDS, NULL}, + {"get_tensor", + (PyCFunction)(void (*)(void))eager_tensor_method_get_underline_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 038a1254d7ef6..5f1d809168a42 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -42,6 +42,18 @@ PyObject* 
eager_tensor_properties_get_name(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +PyObject* eager_tensor_properties_get_type(EagerTensorObject* self, + void* closure) { + EAGER_SYNC_TRY + if (self->eager_tensor.is_dense_tensor()) { + return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); + } else { + Py_INCREF(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value, void* closure) { EAGER_SYNC_TRY @@ -74,8 +86,13 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, return ToPyObject(*accumulation_grad_node->Grad()); } else { VLOG(6) << "Get grad for tensor: " << self->eager_tensor.name(); - auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); - return ToPyObject(meta->Grad()); + auto meta = egr::EagerUtils::nullable_autograd_meta(self->eager_tensor); + if (meta) { + return ToPyObject(meta->Grad()); + } else { + Py_INCREF(Py_None); + return Py_None; + } } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -185,6 +202,8 @@ struct PyGetSetDef variable_properties[] = { nullptr, nullptr}, {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, nullptr}, + {"type", (getter)eager_tensor_properties_get_type, nullptr, nullptr, + nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index c1049d240795c..5c74653a719d3 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -450,6 +450,18 @@ PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype) { return obj.ptr(); } +PyObject* ToPyObject(const paddle::framework::proto::VarType& type) { + auto obj = ::pybind11::cast(type); + obj.inc_ref(); + return obj.ptr(); +} + +PyObject* ToPyObject(const paddle::framework::LoDTensor* value) { + auto obj = ::pybind11::cast(value, py::return_value_policy::copy); + obj.inc_ref(); + return obj.ptr(); +} + PyObject* ToPyObject(const void* value) { if (value == nullptr) { Py_INCREF(Py_None); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 20c82c572c325..e1a7ed2415014 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/pten/core/dense_tensor.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -54,7 +55,9 @@ PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const platform::Place& value); +PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); +PyObject* ToPyObject(const paddle::framework::proto::VarType& type); PyObject* ToPyObject(const void* value); PyObject* ToPyObject( const std::unordered_map>& value); diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 716e1ac3d30bb..fe088a9568146 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -285,6 +285,9 @@ const paddle::platform::Place& DenseTensor::place() const { storage_, paddle::platform::errors::PreconditionNotMet( "Tensor not initialized yet when Tensor::place() is called.")); + if (storage_->data_shared()) { + return storage_->data_shared()->place(); + } return storage_->place(); } diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index e06e7f52dd671..3cccaceb8e698 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -758,10 +758,10 @@ def is_combine_index(item): @framework.dygraph_only def _grad_ivar(self): - if self.grad._is_initialized(): - return self.grad - else: - return None + if self.grad is not None: + if self.grad._is_initialized(): + return self.grad + return None @framework.dygraph_only def _set_grad_ivar(self, value): @@ -782,6 +782,10 @@ def clear_gradient(self, set_to_zero=True): def clone(self): return _C_ops_.assign(self) + @framework.dygraph_only + def value(self): + return self + if core._in_eager_mode() and not hasattr(core, "eager"): return @@ -805,6 +809,7 @@ def clone(self): setattr(core.eager.EagerTensor, "_set_grad_ivar", _set_grad_ivar) setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) setattr(core.eager.EagerTensor, "clone", clone) + setattr(core.eager.EagerTensor, "value", value) else: setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 3ab7981cdb1a4..9630462b4963a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -109,7 +109,7 @@ def test_dtype_base(self): core.VarDesc.VarType.COMPLEX128) -class EagerTensorPropertiesTestCase(unittest.TestCase): +class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase): def constructor(self, place): egr_tensor = core.eager.EagerTensor() self.assertEqual(egr_tensor.persistable, False) @@ -645,7 +645,8 @@ def test_copy_and_copy_to(self): self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_cpu_place()) - def test_share_buffer_to(): + def test_share_buffer_to(self): + with _test_eager_guard(): arr = np.ones([4, 16, 16, 32]).astype('float32') arr1 = np.zeros([4, 16]).astype('float32') arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( @@ -661,7 +662,7 @@ def test_share_buffer_to(): else: tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CPUPlace()) - self.assertTrue(np.array_equal(tensor.numpy(), arr1)) + 
self.assertTrue(np.array_equal(tensor.numpy(), arr)) self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) tensor2._share_buffer_to(tensor) self.assertTrue(np.array_equal(tensor.numpy(), arr2)) @@ -694,6 +695,7 @@ def test_properties(self): self.assertEqual(tensor.stop_gradient, False) tensor.stop_gradient = True self.assertEqual(tensor.stop_gradient, True) + self.assertEqual(tensor.type, core.VarDesc.VarType.LOD_TENSOR) def test_global_properties(self): print("Test_global_properties") @@ -714,6 +716,25 @@ def test_place_guard(self): self.assertTrue(core.eager._get_expected_place().is_cpu_place()) core._disable_eager_mode() + def test_value(self): + with _test_eager_guard(): + arr = np.random.rand(4, 16, 16, 32).astype('float64') + + egr_tensor0 = core.eager.EagerTensor(value=arr) + self.assertEqual(egr_tensor0.persistable, False) + self.assertTrue("generated" in egr_tensor0.name) + self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) + self.assertTrue( + egr_tensor0.place._equals( + paddle.fluid.framework._current_expected_place())) + self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP64) + self.assertEqual(egr_tensor0.stop_gradient, True) + self.assertTrue(egr_tensor0.value().get_tensor()._dtype(), + core.VarDesc.VarType.FP64) + self.assertTrue(egr_tensor0.value().get_tensor()._place(), + paddle.fluid.framework._current_expected_place()) + self.assertTrue(egr_tensor0.value().get_tensor()._is_initialized()) + class EagerParamBaseUsageTestCase(unittest.TestCase): def test_print(self): @@ -803,6 +824,7 @@ def test_backward_with_single_tensor(self): self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4)) self.assertTrue(np.array_equal(egr_tensor12.gradient(), None)) + egr_tensor12.stop_gradient = False egr_tensor12.backward() self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index d2e1a4fbb1882..44d73612b1cb5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -181,6 +181,7 @@ def test_auto_prune2(self): self.func_auto_prune2() self.func_auto_prune2() + # TODO(jiabin): Support this when we support better split tensor def test_auto_prune3(self): with fluid.dygraph.guard(): case3 = AutoPruneLayer3(input_size=784) @@ -217,7 +218,7 @@ def test_auto_prune5(self): self.assertTrue(case4.linear.weight._grad_ivar() is not None) self.assertTrue((part2.gradient() == 0).all()) - def test_auto_prune6(self): + def func_auto_prune6(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") value1 = np.arange(6).reshape(2, 3).astype("float32") @@ -235,7 +236,12 @@ def test_auto_prune6(self): self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) - def test_auto_prune7(self): + def test_auto_prune6(self): + with _test_eager_guard(): + self.func_auto_prune6() + self.func_auto_prune6() + + def func_auto_prune7(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") value1 = np.arange(6).reshape(2, 3).astype("float32") @@ -253,7 +259,12 @@ def test_auto_prune7(self): self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) - def test_auto_prune8(self): + def test_auto_prune7(self): + with _test_eager_guard(): + self.func_auto_prune7() + 
self.func_auto_prune7() + + def func_auto_prune8(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") value1 = np.arange(6).reshape(2, 3).astype("float32") @@ -278,7 +289,12 @@ def test_auto_prune8(self): self.assertFalse( np.array_equal(linear_origin, linear.weight.numpy())) - def test_auto_prune9(self): + def test_auto_prune8(self): + with _test_eager_guard(): + self.func_auto_prune8() + self.func_auto_prune8() + + def func_auto_prune9(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") value1 = np.arange(6).reshape(2, 3).astype("float32") @@ -307,7 +323,12 @@ def test_auto_prune9(self): except ValueError as e: assert type(e) == ValueError - def test_auto_prune10(self): + def test_auto_prune9(self): + with _test_eager_guard(): + self.func_auto_prune9() + self.func_auto_prune9() + + def func_auto_prune10(self): with fluid.dygraph.guard(): value0 = np.arange(26).reshape(2, 13).astype("float32") value1 = np.arange(6).reshape(2, 3).astype("float32") @@ -321,12 +342,18 @@ def test_auto_prune10(self): out2 = linear2(b) out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) + #TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore. fluid.set_flags({'FLAGS_sort_sum_gradient': True}) out.backward() self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) - def test_auto_prune_with_optimizer(self): + def test_auto_prune10(self): + with _test_eager_guard(): + self.func_auto_prune10() + self.func_auto_prune10() + + def func_auto_prune_with_optimizer(self): vocab_size = 100 size = 20 batch_size = 16 @@ -341,7 +368,6 @@ def test_auto_prune_with_optimizer(self): grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip) - indices = fluid.dygraph.to_variable(indices) embed = fluid.dygraph.to_variable(embed) dummy_loss = model(embed) @@ -374,7 +400,12 @@ def test_auto_prune_with_optimizer(self): assert model.embed1.weight._grad_ivar() is None assert model.linear_1.weight._grad_ivar() is None - def test_case2_prune_no_grad_branch(self): + def test_auto_prune_with_optimizer(self): + with _test_eager_guard(): + self.func_auto_prune_with_optimizer() + self.func_auto_prune_with_optimizer() + + def func_case2_prune_no_grad_branch(self): with fluid.dygraph.guard(): value1 = np.arange(784).reshape(1, 784) value2 = np.arange(1).reshape(1, 1) @@ -386,7 +417,12 @@ def test_case2_prune_no_grad_branch(self): self.assertTrue(case3.linear2.weight._grad_ivar() is None) self.assertTrue(case3.linear.weight._grad_ivar() is not None) - def test_case3_prune_no_grad_branch2(self): + def test_case2_prune_no_grad_branch(self): + with _test_eager_guard(): + self.func_case2_prune_no_grad_branch() + self.func_case2_prune_no_grad_branch() + + def func_case3_prune_no_grad_branch2(self): with fluid.dygraph.guard(): value1 = np.arange(1).reshape(1, 1) linear = fluid.dygraph.Linear(1, 1, act=None) @@ -399,13 +435,23 @@ def test_case3_prune_no_grad_branch2(self): loss.backward() self.assertTrue(linear.weight._grad_ivar() is None) - def test_case4_with_no_grad_op_maker(self): + def test_case3_prune_no_grad_branch2(self): + with _test_eager_guard(): + self.func_case3_prune_no_grad_branch2() + self.func_case3_prune_no_grad_branch2() + + def func_case4_with_no_grad_op_maker(self): with fluid.dygraph.guard(): out = 
fluid.layers.gaussian_random(shape=[20, 30]) loss = fluid.layers.mean(out) loss.backward() self.assertTrue(out._grad_ivar() is None) + def test_case4_with_no_grad_op_maker(self): + with _test_eager_guard(): + self.func_case4_with_no_grad_op_maker() + self.func_case4_with_no_grad_op_maker() + if __name__ == '__main__': unittest.main() From 724d49da1844b8b4bd552c6381e5a9cd87d5abaf Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 17 Jan 2022 10:34:08 +0800 Subject: [PATCH 149/151] [Dy2St]close enable_inplace PASS for PE and open test_mnist_pure_fp16.py for windows (#38752) * close enable_inplace PASS for PE, and test dy2st pure fp16 training stability * add some comment * enlarge atol --- .../unittests/dygraph_to_static/test_mnist_pure_fp16.py | 8 ++++++-- .../unittests/dygraph_to_static/test_resnet_pure_fp16.py | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py index 029e3e9a535b6..62878f5cfc93d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py @@ -33,7 +33,7 @@ def train_dygraph(self): return self.train(to_static=False) def test_mnist_to_static(self): - if paddle.fluid.is_compiled_with_cuda() and os.name != 'nt': + if paddle.fluid.is_compiled_with_cuda(): dygraph_loss = self.train_dygraph() static_loss = self.train_static() # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here. @@ -52,7 +52,11 @@ def train(self, to_static=False): if to_static: print("Successfully to apply @to_static.") - mnist = paddle.jit.to_static(mnist) + build_strategy = paddle.static.BuildStrategy() + # Why set `build_strategy.enable_inplace = False` here? + # Because we find that this PASS strategy of PE makes dy2st training loss unstable. + build_strategy.enable_inplace = False + mnist = paddle.jit.to_static(mnist, build_strategy=build_strategy) optimizer = paddle.optimizer.Adam( learning_rate=0.001, parameters=mnist.parameters()) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py index 6620703ab7182..cf5c2b731141f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py @@ -106,7 +106,11 @@ def train(to_static, build_strategy=None): class TestResnet(unittest.TestCase): def train(self, to_static): program_translator.enable(to_static) - return train(to_static) + build_strategy = paddle.static.BuildStrategy() + # Why set `build_strategy.enable_inplace = False` here? + # Because we find that this PASS strategy of PE makes dy2st training loss unstable. 
+ build_strategy.enable_inplace = False + return train(to_static, build_strategy) def test_resnet(self): if fluid.is_compiled_with_cuda(): From 096afbe1f5b473ed994a44d8d08715f0f44ad2f5 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 17 Jan 2022 10:42:09 +0800 Subject: [PATCH 150/151] fix paddle.where torch diff (#38870) * fix paddle.where torch diff * update --- .../fluid/tests/unittests/test_where_op.py | 38 +++++++++++++++++++ python/paddle/tensor/search.py | 29 ++++++++++---- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index 908b2577a826b..5b92fcf52def0 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -305,6 +305,36 @@ def test_dygraph_api_broadcast_8(self): b_shape = [2, 2, 1] self.__test_where_with_broadcast_dygraph(cond_shape, a_shape, b_shape) + def test_where_condition(self): + data = np.array([[True, False], [False, True]]) + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[-1, 2]) + y = paddle.where(x) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 2) + z = fluid.layers.concat(list(y), axis=1) + exe = fluid.Executor(fluid.CPUPlace()) + + res, = exe.run(feed={'x': data}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[0, 0], [1, 1]]) + self.assertTrue(np.allclose(expect_out, np.array(res))) + + data = np.array([True, True, False]) + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[-1]) + y = paddle.where(x) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 1) + z = fluid.layers.concat(list(y), axis=1) + exe = fluid.Executor(fluid.CPUPlace()) + res, = exe.run(feed={'x': data}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[0], [1]]) + self.assertTrue(np.allclose(expect_out, np.array(res))) + class TestWhereOpError(unittest.TestCase): def test_errors(self): @@ -326,6 +356,14 @@ def test_type(): self.assertRaises(TypeError, test_type) + def test_value_error(self): + with fluid.dygraph.guard(): + cond_shape = [2, 2, 4] + cond_tmp = paddle.rand(cond_shape) + cond = cond_tmp < 0.3 + a = paddle.rand(cond_shape) + self.assertRaises(ValueError, paddle.where, cond, a) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 0685e276458d3..e15d2d49d5493 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -523,23 +523,26 @@ def mode(x, axis=-1, keepdim=False, name=None): return values, indices -def where(condition, x, y, name=None): +def where(condition, x=None, y=None, name=None): r""" Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. + **Note**: + ``paddle.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``. + .. math:: out_i = - \\begin{cases} - x_i, \quad \\text{if} \\ condition_i \\ is \\ True \\\\ - y_i, \quad \\text{if} \\ condition_i \\ is \\ False \\\\ - \\end{cases} + \begin{cases} + x_i, \quad \text{if} \ condition_i \ is \ True \\ + y_i, \quad \text{if} \ condition_i \ is \ False \\ + \end{cases} Args: condition(Tensor): The condition to choose x or y. - x(Tensor): x is a Tensor with data type float32, float64, int32, int64. - y(Tensor): y is a Tensor with data type float32, float64, int32, int64. 
+ x(Tensor, optional): x is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. + y(Tensor, optional): y is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -559,7 +562,19 @@ def where(condition, x, y, name=None): print(out) #out: [1.0, 1.0, 3.2, 1.2] + + out = paddle.where(x>1) + print(out) + #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, + # [[2], + # [3]]),) """ + if x is None and y is None: + return nonzero(condition, as_tuple=True) + + if x is None or y is None: + raise ValueError("either both or neither of x and y should be given") + if not in_dygraph_mode(): check_variable_and_dtype(condition, 'condition', ['bool'], 'where') check_variable_and_dtype( From 73742d362bf4e6089d8a3d0cbe956a4402a99253 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 17 Jan 2022 10:42:33 +0800 Subject: [PATCH 151/151] add convert func for string helper (#38600) --- paddle/fluid/string/string_helper.h | 16 ++++++++++++++++ paddle/fluid/string/string_helper_test.cc | 7 +++++++ 2 files changed, 23 insertions(+) diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index 4f1aee7c7ed17..c52b7a99a777a 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -193,6 +193,22 @@ std::string join_strings(const Container& strs, const std::string& delim) { return str; } +template +std::string join_strings(const Container& strs, DelimT&& delim, + ConvertFunc&& func) { + std::stringstream ss; + size_t i = 0; + for (const auto& elem : strs) { + if (i > 0) { + ss << delim; + } + ss << func(elem); + ++i; + } + + return ss.str(); +} + // A helper class for reading lines from file. A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. diff --git a/paddle/fluid/string/string_helper_test.cc b/paddle/fluid/string/string_helper_test.cc index 4796bf7507aba..67456e16a93b6 100644 --- a/paddle/fluid/string/string_helper_test.cc +++ b/paddle/fluid/string/string_helper_test.cc @@ -56,3 +56,10 @@ TEST(StringHelper, JoinStrings) { result = paddle::string::join_strings(v, " new "); EXPECT_EQ(result, "hello new world"); } + +TEST(StringHelper, JoinStringsWithConversion) { + std::vector v = {2, 3}; + auto result = + paddle::string::join_strings(v, ",", [](int x) { return x * x; }); + EXPECT_EQ(result, "4,9"); +}
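
Two of the commits above — the kernel-factory change (#38972) and the string-helper change (#38600) — add new public helpers. The standalone C++ sketches below illustrate how those additions can be called; they are illustrative only and not part of the patch series. The function signatures and observable behaviour come from the diffs and their tests, while the sample data, the DumpScaleKernels helper name, and the main() wrapper are assumptions made for the example.

// Sketch: joining container elements, with and without the new conversion
// functor overload added to paddle/fluid/string/string_helper.h.
#include <iostream>
#include <string>
#include <vector>

#include "paddle/fluid/string/string_helper.h"

int main() {
  // Existing overload: join string elements with a string delimiter.
  std::vector<std::string> words = {"hello", "world"};
  std::cout << paddle::string::join_strings(words, " ") << "\n";  // hello world

  // New overload: the functor converts each element before it is streamed
  // into the result, so non-string element types can be joined directly.
  std::vector<int> nums = {2, 3};
  std::cout << paddle::string::join_strings(nums, ",",
                                            [](int x) { return x * x; })
            << "\n";  // 4,9
  return 0;
}

The SelectKernelMap sketch assumes, as the updated test_kernel_factory.cc does, that kernels named "scale" have been registered (hence the scale_kernel dependency added to the test's CMake target).

// Sketch: enumerating every registered variant of one kernel by name.
#include <iostream>

#include "paddle/pten/core/kernel_factory.h"

void DumpScaleKernels() {
  // SelectKernelMap returns a copy of the per-KernelKey map; an unknown
  // kernel name yields an empty map rather than throwing.
  auto kernel_map = pten::KernelFactory::Instance().SelectKernelMap("scale");
  for (auto& iter : kernel_map) {
    std::cout << iter.first << ": " << iter.second << "\n";
  }
}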