From 2a5aa2eea5aa307960db5336fe154bb95b4b0fb5 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:34:04 +0800 Subject: [PATCH 01/39] [Semi-Auto] Adapt reduction rule to phi (#57241) * adapt reduction spmd rule to phi * remove useless comments --- .../spmd_rules/reduction_spmd_rule.cc | 191 ------------------ .../spmd_rules/reduction_spmd_rule.h | 46 ----- .../auto_parallel/spmd_rules/rules.h | 13 -- paddle/phi/core/attribute.h | 5 +- .../auto_parallel/inferspmd_utils.cc | 20 +- .../auto_parallel/inferspmd_utils.h | 16 ++ paddle/phi/infermeta/spmd_rules/reduction.cc | 178 ++++++++++++++++ paddle/phi/infermeta/spmd_rules/reduction.h | 35 ++++ paddle/phi/infermeta/spmd_rules/rules.h | 49 +++++ .../spmd_rules/test_reduction_rule.py | 116 ++++++++--- 10 files changed, 389 insertions(+), 280 deletions(-) delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.cc create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.h diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc deleted file mode 100644 index 62940545e8845..0000000000000 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" -#include -#include "paddle/phi/core/distributed/auto_parallel/utils.h" - -namespace paddle { -namespace distributed { -namespace auto_parallel { - -using phi::distributed::auto_parallel::str_join; - -std::string ReductionSPMDRule::GetOutputNotation( - int64_t input_ndim, - const std::string& input_axes, - const paddle::framework::AttributeMap& attrs) { - bool keep_dim = ExtractAttr("keep_dim", attrs); - std::vector reduce_dims = - ExtractAttr>("axis", attrs); - - // convert the negative dim value to normal dim value - for (auto& reduce_dim : reduce_dims) { - if (reduce_dim < 0) { - reduce_dim = input_ndim + reduce_dim; - } - } - - std::string output_axes = ""; - for (int64_t i = 0; i < input_ndim; i++) { - std::vector::iterator iter = - std::find(reduce_dims.begin(), reduce_dims.end(), i); - if (iter != reduce_dims.end()) { - // if i is reduce dim, the corresponding input axis - // will not be appended at the end of output_axes - if (keep_dim) { - output_axes.append(1, '1'); - } - } else { - // otherwise, the corresponding input axis - // will be appended at the end of output_axes - output_axes.append(1, input_axes[i]); - } - } - - return output_axes; -} - -std::pair, std::vector> -ReductionSPMDRule::InferForward(const std::vector& input_specs, - const paddle::framework::AttributeMap& attrs) { - // step0: Verify Input Args Based on Elementwise Logic - int64_t ninputs = input_specs.size(); - PADDLE_ENFORCE_EQ( - ninputs, - 1, - phi::errors::InvalidArgument("The size of InputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - VerifySpecs(input_specs, "reduction"); - - // step1: Build Einsum Notation - // get einsum notation for input - std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; - int64_t ndim = input_specs[0].shape().size(); - std::vector input_axes_vec; - std::string input_axes = alphabet.substr(0, ndim); - input_axes_vec.emplace_back(input_axes); - - // get einsum notation for output - std::string output_axes = GetOutputNotation(ndim, alphabet, attrs); - - // step2: Sharding Propogation - // step2.1: merge input shardings - std::vector>> axes_sharding_info; - axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs); - std::unordered_map axis_to_dim_map = - ShardingMergeForTensors(axes_sharding_info); - - // step2.2: infer output dimsmapping from merged input dimsmapping - std::vector output_dims_mapping = - GetDimsMappingForAxes(output_axes, axis_to_dim_map); - - // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with - // input dist_attr. - TensorDistAttr output_dist_attr = - CopyTensorDistAttrForOutput(input_specs[0].dist_attr()); - output_dist_attr.set_dims_mapping(output_dims_mapping); - - // step3: handle partial - // Step3.1 Output Partial - std::vector partial_on_dims = - ResoluteOutputPartialDimension(axis_to_dim_map, output_axes); - output_dist_attr.set_partial_status( - partial_on_dims /*, handle reduce_type in future */); - - std::vector output_dist_attrs; - output_dist_attrs.emplace_back(output_dist_attr); - - // Step3.2 handle input tensor partial (TODO) - // If the op is a linear op, i.e. `linearity` is true, it supports - // the input to be partial. Otherwise, the input cannot be partial - // on reduced axes, we should reshard the input when the reduced - // axes are parital. 
- VLOG(4) << "ReductionSPMDRule InferForward: "; - for (int64_t i = 0; i < ninputs; i++) { - VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(input_specs[i].shape()) << "] " - << "src_dims_mapping: [" << str_join(input_specs[i].dims_mapping()) - << "] " - << "dst_dims_mapping: [" << str_join(input_specs[i].dims_mapping()) - << "]"; - } - VLOG(4) << "Output dims_mapping: [" + str_join(output_dims_mapping) + "] " - << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n"; - - return {{input_specs[0].dist_attr()}, output_dist_attrs}; -} - -std::pair, std::vector> -ReductionSPMDRule::InferBackward( - const std::vector& input_specs, - const std::vector& output_specs, - const paddle::framework::AttributeMap& attrs) { - // step0: Verify Input Args Based on Elementwise Logic - int64_t ninputs = input_specs.size(); - int64_t noutputs = output_specs.size(); - PADDLE_ENFORCE_EQ( - ninputs, - 1, - phi::errors::InvalidArgument("The size of InputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - PADDLE_ENFORCE_EQ( - noutputs, - 1, - phi::errors::InvalidArgument("The size of OutputSpec in reduction must " - "be equal to 1, but got [%d].", - ninputs)); - VerifySpecs(output_specs, "reduction_backward"); - - // step1: Build Einsum Notation - // get einsum notation for input - std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; - int64_t ndim = input_specs[0].shape().size(); - std::string input_axes = alphabet.substr(0, ndim); - - // get einsum notation for output - std::string output_axes = GetOutputNotation(ndim, alphabet, attrs); - - // step2: Sharding Propogation - std::unordered_map axis_to_dim_map = - ShardingMergeForTensors({{output_axes, output_specs[0].dims_mapping()}}); - - // step2.2: infer input dims mapping from output dims mapping - std::vector input_dims_mapping = - GetDimsMappingForAxes(input_axes, axis_to_dim_map, true); - - // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with - // input dist_attr. - TensorDistAttr input_dist_attr(input_specs[0].dist_attr()); - input_dist_attr.set_dims_mapping(input_dims_mapping); - - // step3: handle partial (TODO) - - VLOG(4) << "ReductionSPMDRule InferBackward: "; - VLOG(4) << "Output shape:[" << str_join(output_specs[0].shape()) - << "] dims_mapping: [" << str_join(output_specs[0].dims_mapping()) - << "]"; - VLOG(4) << "Input0: " - << " shape: [" << str_join(input_specs[0].shape()) << "] " - << "dims_mapping: [" << str_join(input_dist_attr.dims_mapping()) - << "]"; - - return {{input_dist_attr}, {output_specs[0].dist_attr()}}; -} - -} // namespace auto_parallel -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h deleted file mode 100644 index 36e412b704927..0000000000000 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" - -namespace paddle { -namespace distributed { -namespace auto_parallel { - -class ReductionSPMDRule : public SPMDRuleBase { - public: - std::pair, std::vector> - InferForward(const std::vector& input_specs, - const paddle::framework::AttributeMap& attrs) override; - - std::pair, std::vector> - InferBackward(const std::vector& input_specs, - const std::vector& output_specs, - const paddle::framework::AttributeMap& attrs) override; - - private: - std::string GetOutputNotation(int64_t input_ndim, - const std::string& input_axes, - const paddle::framework::AttributeMap& attrs); -}; -} // namespace auto_parallel -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index c876fa59a7034..54ae4325b8a15 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -18,7 +18,6 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h" -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" @@ -30,18 +29,6 @@ namespace paddle { namespace distributed { namespace auto_parallel { -// reduction rules -REGISTER_SPMD_RULE(all, ReductionSPMDRule); -REGISTER_SPMD_RULE(amax, ReductionSPMDRule); -REGISTER_SPMD_RULE(amin, ReductionSPMDRule); -REGISTER_SPMD_RULE(any, ReductionSPMDRule); -REGISTER_SPMD_RULE(frobenius_norm, ReductionSPMDRule); -REGISTER_SPMD_RULE(max, ReductionSPMDRule); -REGISTER_SPMD_RULE(mean, ReductionSPMDRule); -REGISTER_SPMD_RULE(min, ReductionSPMDRule); -REGISTER_SPMD_RULE(prod, ReductionSPMDRule); -REGISTER_SPMD_RULE(sum, ReductionSPMDRule); - // layer_norm rule REGISTER_SPMD_RULE(layer_norm, LayerNormSPMDRule); diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h index 40c66a669c9e8..6f032f4a5bd99 100644 --- a/paddle/phi/core/attribute.h +++ b/paddle/phi/core/attribute.h @@ -30,14 +30,17 @@ namespace phi { class Place; // NOTE: Add needed type in the future +// Move vector before vector, because when +// vector is before vector, a python integer +// list will be converted to vector in error. 
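// (Editorial sketch, not part of the original patch; the attribute name
// "axis" below is only an illustrative assumption.) With std::vector<int>
// listed ahead of std::vector<int64_t> in the variant, a Python integer
// list such as axis=[1, 2] would be stored as std::vector<int>, and a
// consumer asking for std::vector<int64_t> would then hit a
// bad_variant_access; listing std::vector<int64_t> first makes the 64-bit
// form the default binding for Python integer lists.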
using Attribute = paddle::variant, std::vector, + std::vector, std::vector, std::vector, std::vector, diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc index a1895b6dfbd79..6e0c0f696fef4 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc @@ -54,7 +54,7 @@ AttrType InferSpmdContext::AttrAt(size_t idx) const { } template <> -bool InferSpmdContext::AttrAt(size_t idx) const { +bool InferSpmdContext::AttrAt(size_t idx) const { try { auto attr = attrs_.at(idx); if (attr.type() == typeid(int)) { @@ -70,6 +70,24 @@ bool InferSpmdContext::AttrAt(size_t idx) const { } } +template <> +std::vector InferSpmdContext::AttrAt(size_t idx) const { + try { + auto attr = attrs_.at(idx); + if (attr.type() == typeid(std::vector)) { + std::vector val = PADDLE_GET_CONST(std::vector, attr); + return std::vector(val.begin(), val.end()); + } else { + return paddle::get>(attr); + } + } catch (paddle::bad_variant_access const& e) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attribute cast error in InferSpmd Context, the input attr type is " + "`%s`, but the expected attribute type is `bool`.", + attrs_.at(idx).type().name())); + } +} + const Attribute& InferSpmdContext::AttrAt(size_t idx) const { return attrs_.at(idx); } diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h index 3896bfcd6a2fe..23b147a4bb3d7 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h @@ -138,8 +138,24 @@ struct InferSpmdFnImpl { } \ } +#define PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(attr_type) \ + template \ + struct InferSpmdFnCallHelper { \ + template \ + static SpmdInfo Call(const InferSpmdContext& ctx, \ + PreviousArgs&... pargs) { \ + attr_type arg = ctx.AttrAt(attr_idx); \ + return InferSpmdFnCallHelper::template Call( \ + ctx, pargs..., arg); \ + } \ + } + // TODO(chenweihang): support other attr type later as needed PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(std::vector); + PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); /* End case */ template diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc new file mode 100644 index 0000000000000..24c90a1792341 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/spmd_rules/reduction.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi { +namespace distributed { + +using phi::distributed::auto_parallel::str_join; + +////////////////// Utils Functions ////////////////// +std::string GetOutputNotation(int input_ndim, + const std::string& input_axes, + std::vector reduce_dims, + bool keep_dim) { + // convert the negative dim value to normal dim value + for (auto& reduce_dim : reduce_dims) { + if (reduce_dim < 0) { + reduce_dim = input_ndim + reduce_dim; + } + } + + std::string output_axes = ""; + for (int i = 0; i < input_ndim; i++) { + std::vector::iterator iter = + std::find(reduce_dims.begin(), reduce_dims.end(), i); + if (iter != reduce_dims.end()) { + // if i is reduce dim, the corresponding input axis + // will not be appended at the end of output_axes + if (keep_dim) { + output_axes.append(1, '1'); + } + } else { + // otherwise, the corresponding input axis + // will be appended at the end of output_axes + output_axes.append(1, input_axes[i]); + } + } + + return output_axes; +} + +SpmdInfo ReductionInferSpmd(const DistMetaTensor& x, + const std::vector& axis, + bool keep_dim) { + // Step0: Verify input args based on reduction logic + auto x_shape = phi::vectorize(x.dims()); + int x_ndim = x_shape.size(); + auto x_dist_attr_src = x.dist_attr(); + std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + x_ndim, + x_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); + + // Step1: Build Einsum Notation + // get einsum notation for input + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + std::string x_axes = alphabet.substr(0, x_ndim); + + // get einsum notation for output + std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim); + + // Step2: Sharding Propogation + // Step2.1: Merge input shardings + std::pair> x_sharding_info(x_axes, + x_dims_mapping); + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({x_sharding_info}); + + // Step2.2: Infer output dimsmapping from merged input dimsmapping + std::vector out_dims_mapping = + GetDimsMappingForAxes(out_axes, axis_to_dim_map); + + // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with + // input dist_attr. + TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src); + out_dist_attr.set_dims_mapping(out_dims_mapping); + + // Step3: handle partial + // Step3.1 Output Partial + std::vector partial_on_dims = + ResoluteOutputPartialDimension(axis_to_dim_map, out_axes); + out_dist_attr.set_partial_status( + partial_on_dims /*, handle reduce_type in future */); + + // Step3.2 handle input tensor partial (TODO) + // If the op is a linear op, i.e. `linearity` is true, it supports + // the input to be partial. Otherwise, the input cannot be partial + // on reduced axes, we should reshard the input when the reduced + // axes are parital. 
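// (Editorial sketch with hypothetical shapes, mirroring the unit test in
// this patch.) For x of shape [64, 32] with dims_mapping [0, -1] on a 1-D
// mesh, x_axes = "ab". Reducing axis = {0} with keep_dim = false gives
// out_axes = "b" and an output dims_mapping of [-1], while mesh dim 0
// (which shards the reduced axis 'a') is recorded as a partial dim of the
// output; with keep_dim = true the notation becomes "1b" and the output
// dims_mapping is [-1, -1].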
+ VLOG(4) << "ReductionInferSpmd:"; + VLOG(4) << "axis: " << str_join(axis) << ", keep_dim: " << keep_dim; + VLOG(4) << "Einsum Notation: " << x_axes << " --> " << out_axes; + VLOG(4) << "Input0 shape: [" << str_join(x_shape) << "] " + << "dims_mapping: [" << str_join(x_dims_mapping) << "]"; + VLOG(4) << "Output dims_mapping: [" + str_join(out_dims_mapping) + "] " + << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n"; + + return {{x_dist_attr_src}, {out_dist_attr}}; +} + +SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + const std::vector& axis, + bool keep_dim) { + // Step0: Verify input args based on reduction logic + auto x_shape = phi::vectorize(x.dims()); + auto out_shape = phi::vectorize(out.dims()); + int x_ndim = x_shape.size(); + int out_ndim = out_shape.size(); + auto out_dist_attr_src = out.dist_attr(); + std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + out_ndim, + out_dims_mapping.size(), + phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's " + "dims_mapping size [%d] are not matched.", + out_ndim, + out_dims_mapping.size())); + + // Step1: Build einsum notation + // get einsum notation for input + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + std::string x_axes = alphabet.substr(0, x_ndim); + + // get einsum notation for output + std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim); + + // Step2: Sharding propogation + // Step2.1: Merge input shardings + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({{out_axes, out_dims_mapping}}); + + // Step2.2: Infer input dims mapping from output dims mapping + std::vector x_dims_mapping = + GetDimsMappingForAxes(x_axes, axis_to_dim_map, true); + + // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with + // input dist_attr. + TensorDistAttr x_dist_attr_dst(x.dist_attr()); + x_dist_attr_dst.set_dims_mapping(x_dims_mapping); + + // Step3: handle partial (TODO) + + VLOG(4) << "ReductionInferSpmdReverse: "; + VLOG(4) << "Output shape:[" << str_join(out_shape) << "] dims_mapping: [" + << str_join(out_dims_mapping) << "]"; + VLOG(4) << "Input0: " + << "shape: [" << str_join(x_shape) << "] " + << "dims_mapping: [" << str_join(x_dims_mapping) << "]\n\n"; + + return {{x_dist_attr_dst}, {out_dist_attr_src}}; +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/reduction.h b/paddle/phi/infermeta/spmd_rules/reduction.h new file mode 100644 index 0000000000000..ed9341ddc6904 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/reduction.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { + +SpmdInfo ReductionInferSpmd(const DistMetaTensor& x, + const std::vector& axis, + bool keep_dim); + +SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + const std::vector& axis, + bool keep_dim); + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index 4406e17495d14..71a726e3d8edc 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h" #include "paddle/phi/infermeta/spmd_rules/elementwise.h" #include "paddle/phi/infermeta/spmd_rules/matmul.h" +#include "paddle/phi/infermeta/spmd_rules/reduction.h" #include "paddle/phi/infermeta/spmd_rules/replicated.h" /** @@ -46,6 +47,16 @@ PD_REGISTER_SPMD_RULE(matmul, PD_INFER_SPMD(phi::distributed::MatmulInferSpmd), PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + elementwise_unary, + PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmd), + PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmdReverse)); + +PD_REGISTER_SPMD_RULE( + elementwise_binary, + PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd), + PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse)); + // default data parallel rule PD_REGISTER_SPMD_RULE( unsqueeze, @@ -408,5 +419,43 @@ PD_REGISTER_SPMD_RULE( // TODO(pkuzyc): add multiary elementwise rule +// reduction rule +PD_REGISTER_SPMD_RULE( + all, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + amax, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + amin, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + any, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + frobenius_norm, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + max, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + min, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + prod, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + sum, + PD_INFER_SPMD(phi::distributed::ReductionInferSpmd), + PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse)); + } // namespace distributed } // namespace phi diff --git a/test/auto_parallel/spmd_rules/test_reduction_rule.py b/test/auto_parallel/spmd_rules/test_reduction_rule.py index f8069ee226583..ea8398d246fcc 100644 --- a/test/auto_parallel/spmd_rules/test_reduction_rule.py +++ b/test/auto_parallel/spmd_rules/test_reduction_rule.py @@ -13,13 +13,14 @@ # limitations under the License. 
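# Editorial note (summarizing the change made by this patch, not original
# test code): the rule is now obtained via core.get_phi_spmd_rule("max")
# instead of get_spmd_rule("max"), and the attributes are passed
# positionally, e.g.
#   rule.infer_forward(x_spec, axis, keep_dim)
#   rule.infer_backward(x_spec, out_spec, axis, keep_dim)
# replacing the old dict-based call infer_forward([x_spec], attrs).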
import unittest +from collections import OrderedDict -from paddle.distributed.auto_parallel.static.completion import get_spmd_rule from paddle.distributed.auto_parallel.static.dist_attribute import ( DistTensorSpec, TensorDistAttr, ) from paddle.distributed.fleet import auto +from paddle.framework import core class TestReductionSPMDRule(unittest.TestCase): @@ -28,7 +29,7 @@ class TestReductionSPMDRule(unittest.TestCase): """ def setUp(self): - self.rule = get_spmd_rule("max") + self.rule = core.get_phi_spmd_rule("max") x_shape = [64, 32] process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) @@ -40,11 +41,7 @@ def setUp(self): self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) - self.attrs = { - 'keep_dim': False, - 'axis': [0], - 'linearity': False, - } + self.attrs = OrderedDict([('axis', [0]), ('keep_dim', False)]) def test_single_mesh_dim(self): # reduce on dim 0, keep_dim = false @@ -53,7 +50,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -73,7 +70,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -89,7 +86,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -104,7 +101,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -119,7 +116,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0, 1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -135,7 +132,7 @@ def test_single_mesh_dim(self): self.attrs['axis'] = [0, 1] self.x_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -156,7 +153,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] 
infered_output_dist_attrs = result_dist_attrs[1] @@ -167,6 +164,7 @@ def test_multi_mesh_dim(self): self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), False) # reduce on dim 1, 2, keep_dim = false # [-1, 0, 1] --> [-1, 0, 1], [-1], partial_on_dim:[0, 1] @@ -174,7 +172,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -192,7 +190,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -207,7 +205,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -225,7 +223,7 @@ def test_multi_mesh_dim(self): self.attrs['axis'] = [1, 2] self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim'] ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -243,7 +241,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [32] self.out_dist_tensor_spec.set_dims_mapping([-1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -262,7 +263,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [1, 32] self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -277,7 +281,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [64] self.out_dist_tensor_spec.set_dims_mapping([0]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -292,7 +299,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [64, 1] self.out_dist_tensor_spec.set_dims_mapping([0, -1]) result_dist_attrs = 
self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -307,7 +317,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [] self.out_dist_tensor_spec.set_dims_mapping([]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -322,7 +335,10 @@ def test_backward_single_mesh_dim(self): self.out_dist_tensor_spec.shape = [1, 1] self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -343,7 +359,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([0]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -362,7 +381,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([-1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -377,7 +399,10 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96] self.out_dist_tensor_spec.set_dims_mapping([1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -392,13 +417,48 @@ def test_backward_multi_mesh_dim(self): self.out_dist_tensor_spec.shape = [96, 1, 1] self.out_dist_tensor_spec.set_dims_mapping([0, -1, -1]) result_dist_attrs = self.rule.infer_backward( - [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + + def test_backward_multi_mesh_dim_parital(self): + # reduction on dim 1, 2, keep_dim = true, partial_dim=[1] + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1] (output --> input, output) + # output parital_dim: [1], input parital_dim: [] + out_shape = [96, 1, 1] + process_mesh = 
auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [96, 24, 48] + out_tensor_dist_attr = TensorDistAttr() + out_tensor_dist_attr.dims_mapping = [0, -1, -1] + out_tensor_dist_attr.process_mesh = process_mesh + out_tensor_dist_attr._set_partial_dims([1]) + self.out_dist_tensor_spec = DistTensorSpec( + out_shape, out_tensor_dist_attr + ) + + self.attrs['keep_dim'] = True + self.attrs['axis'] = [1, 2] + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['axis'], + self.attrs['keep_dim'], ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_input_dist_attrs[0]._is_partial(), False) if __name__ == "__main__": From 4920462600b26c1050c29ac1caafc3fac72362ba Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 20 Sep 2023 19:48:41 +0800 Subject: [PATCH 02/39] [Dy2St]Modify jit.load into Lazy Initialization Mode for backward program (#57240) * [Dy2St]Modify jit.load into Lazy Initialization Mode for backward program * fix is_test * fix typo * fix logic * fix build scope logic --- .../eager/to_static/run_program_op_func.h | 7 ++- .../eager/to_static/run_program_op_node.h | 45 +++++++++++-------- paddle/fluid/framework/executor_cache.cc | 23 +++++----- paddle/fluid/framework/executor_cache.h | 2 +- python/paddle/jit/translated_layer.py | 22 ++++----- 5 files changed, 55 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index f0ca7c1518b24..a3bb3a2879300 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -140,8 +140,11 @@ inline void run_program_ad_func( RunProgramAPI( x_tmp, params_tmp, out, step_scope, dout, require_any_grad, attrs); VLOG(2) << "start run run_program grad"; - - if (require_any_grad) { + auto is_test = false; + if (attrs.count("is_test")) { + is_test = PADDLE_GET_CONST(bool, attrs.at("is_test")); + } + if (!is_test && require_any_grad) { auto x_names = PADDLE_GET_CONST(std::vector, attrs.at("x_names")); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ebab84ccd1521..fd0d6563945a5 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -315,14 +315,16 @@ static void ShareTensorsFromScopeByValue( static void ShareTensorsFromScopeWithPartialBlock( const std::vector &tensors, const paddle::framework::BlockDesc &forward_global_block, - const paddle::framework::BlockDesc &backward_global_block, + const paddle::framework::BlockDesc *backward_global_block, paddle::framework::Scope *scope) { for (size_t i = 0; i < tensors.size(); ++i) { auto &name = tensors[i]->name(); + bool in_forward_block = forward_global_block.HasVar(name); + bool in_backward_block = + backward_global_block && backward_global_block->HasVar(name); if (name == paddle::framework::kEmptyVarName || name == paddle::framework::kFakeVarName || - (!forward_global_block.HasVar(name) && - !backward_global_block.HasVar(name))) { + (!in_forward_block && !in_backward_block)) { VLOG(2) << "find tensor name is " << name << ", skip it!"; continue; } @@ -660,10 +662,16 
@@ inline void RunProgramAPI( auto *forward_global_block = PADDLE_GET_CONST( paddle::framework::BlockDesc *, attrs.at("forward_global_block")); - auto *backward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("backward_global_block")); auto *forward_program = forward_global_block->Program(); - auto *backward_program = backward_global_block->Program(); + + paddle::framework::BlockDesc *backward_global_block = nullptr; + paddle::framework::ProgramDesc *backward_program = nullptr; + + if (!is_test) { + backward_global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *, + attrs.at("backward_global_block")); + backward_program = backward_global_block->Program(); + } auto &interpretercore_info_cache = paddle::framework::InterpreterCoreInfoCache::Instance(); @@ -710,9 +718,12 @@ inline void RunProgramAPI( global_inner_scope); } // Step 3. get all eager gc vars - std::set skip_eager_delete_vars = - paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( - *backward_program); + std::set skip_eager_delete_vars; + if (!is_test) { + skip_eager_delete_vars = + paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( + *backward_program); + } // all out_vars are skip_eager_var skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); @@ -765,19 +776,15 @@ inline void RunProgramAPI( 1); interpreter_core->Run({}); } - + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); { paddle::platform::RecordEvent record_event( "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); // Get Output details::ShareTensorsFromScopeWithPartialBlock( - out, *forward_global_block, *backward_global_block, global_inner_scope); - details::ShareTensorsFromScopeWithPartialBlock(dout, - *forward_global_block, - *backward_global_block, - global_inner_scope); - - VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + out, *forward_global_block, backward_global_block, global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock( + dout, *forward_global_block, backward_global_block, global_inner_scope); if (is_test || !require_any_grad) { VLOG(4) << "don't require any grad, set this scope can reused"; @@ -939,11 +946,11 @@ inline void RunProgramGradAPI( // Step 4. 
get outputs details::ShareTensorsFromScopeWithPartialBlock(x_grad, *forward_global_block, - *backward_global_block, + backward_global_block, global_inner_scope); details::ShareTensorsFromScopeWithPartialBlock(params_grad, *forward_global_block, - *backward_global_block, + backward_global_block, global_inner_scope); VLOG(4) << "after backward gc all vars"; global_inner_scope->SetCanReused(true); diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 1044f785451e0..64d5ce24d20fe 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -356,7 +356,7 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc *forward_global_block, const paddle::framework::BlockDesc *backward_global_block, - const std::vector output_names, + const std::vector &output_names, const std::vector &x, const std::vector &x_names, const std::vector ¶ms, @@ -415,19 +415,21 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram( } std::set set_parameter_names; - for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { - for (const auto &n : op_desc->Inputs()) { - const auto &input_var_names = n.second; - for (const auto &var_name : input_var_names) { - set_parameter_names.insert(var_name); - } - } - } - for (auto &t : output_names) { set_parameter_names.insert(t); } + if (backward_global_block != nullptr) { + for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { + for (const auto &n : op_desc->Inputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_parameter_names.insert(var_name); + } + } + } + } + for (auto &name : set_parameter_names) { if (!set_output_names.count(name)) { continue; @@ -443,7 +445,6 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram( op_desc->SetInput("x", {name}); op_desc->SetOutput("out", {"@EMPTY@"}); } - paddle::translator::ProgramTranslator program_translator(&local_program, program.get()); diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index f55808175f09f..d30ed6396e65e 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -253,7 +253,7 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc* forward_global_block, const paddle::framework::BlockDesc* backward_global_block, - const std::vector output_names, + const std::vector& output_names, const std::vector& x, const std::vector& x_names, const std::vector& params, diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index a7f51c1a8c164..fce3211f23878 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -347,15 +347,11 @@ def __init__(self, program_desc): self._suffix_varname_dict = None # forward program self._infer_program_desc = self._preprocess(program_desc) - # forward + backward program - self._train_program_desc = self._append_backward_desc( - self._infer_program_desc - ) # forward: @switch_to_static_graph def _create_forward_train_program(self): - whole_program = _build_program_by_desc(self._train_program_desc) + whole_program = _build_program_by_desc(self.train_program) end_op_index = self._infer_program_desc.block(0).op_size() if end_op_index > 0: return 
add_build_strategy_for(whole_program, 0, end_op_index) @@ -369,7 +365,7 @@ def _forward_program_desc(self): # backward @switch_to_static_graph def _create_backward_train_program(self): - whole_program = _build_program_by_desc(self._train_program_desc) + whole_program = _build_program_by_desc(self.train_program) start_op_index = self._infer_program_desc.block(0).op_size() + len( self._output_descs ) @@ -389,9 +385,9 @@ def _backward_program_desc(self): def infer_program(self): return self._infer_program_desc - @property + @LazyInitialized def train_program(self): - return self._train_program_desc + return self._append_backward_desc(self._infer_program_desc) @property def forward_program(self): @@ -1010,10 +1006,15 @@ def _run_dygraph(instance, input, program_holder): ( 'forward_global_block', forward_program.block(0), - 'backward_global_block', - program_holder.backward_program.block(0), ) ) + if not instance._is_test: + attrs.extend( + ( + 'backward_global_block', + program_holder.backward_program.block(0), + ) + ) _legacy_C_ops.run_program( _valid_vars(input_vars), @@ -1055,7 +1056,6 @@ def _run_static_graph(input, program_holder, trace_program): trace_program, exclude=param_var_names ) trace_program.flush() - output_names = [var.name() for var in program_holder.output_descs] # append blocks from 'trace_program' _append_block( main_program, From 70fe4b4961ce72adcf0a90532cd159e112feac58 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:50:45 +0800 Subject: [PATCH 03/39] [gpups ci] (#52962) * gpups information * Update gpups_test.sh * modify gpups,test=document_fix --- tools/gpups_test.sh | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 86be766397652..31ad58a86456e 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -13,6 +13,20 @@ # limitations under the License. +function collect_failed_tests() { + for file in `ls $tmp_dir`; do + exit_code=0 + grep -q 'The following tests FAILED:' $tmp_dir/$file||exit_code=$? + if [ $exit_code -ne 0 ]; then + failuretest='' + else + failuretest=`grep -A 10000 'The following tests FAILED:' $tmp_dir/$file | sed 's/The following tests FAILED://g'|sed '/^$/d'` + failed_test_lists="${failed_test_lists} + ${failuretest}" + fi + done +} + serial_list="^test_conv2d_op$|\ ^test_conv2d_transpose_op$|\ ^test_conv3d_op$" @@ -48,7 +62,6 @@ parallel_list="^init_phi_test$|\ ^test_dygraph_sharding_stage2_bf16$|\ ^test_executor_feed_non_tensor$|\ ^test_flash_attention$|\ -^test_flash_attention_deterministic$|\ ^test_fused_adam_op$|\ ^test_fused_attention_no_dropout$|\ ^test_fused_attention_op$|\ @@ -93,16 +106,24 @@ parallel_list="^init_phi_test$|\ ^test_top_k_v2_op$" cd ${work_dir}/build - +tmp_dir=`mktemp -d` +tmpfile_rand=`date +%s%N` +tmpfile=$tmp_dir/$tmpfile_rand"_"$i set +e -ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4 +ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0; EXIT_CODE_1=$? -ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1 +ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0; EXIT_CODE_2=$? set -e if [ "${EXIT_CODE_1}" != "0" ] || [ "${EXIT_CODE_2}" != "0" ];then echo "Sorry, some tests failed." + collect_failed_tests + rm -f $tmp_dir/* + echo "Summary Failed Tests... 
" + echo "========================================" + echo "The following tests FAILED: " + echo "${failuretest}" | sort -u exit 8 fi From 0cb7a2812829263dc5bab3597b7bd07127e81bd6 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 20 Sep 2023 20:13:02 +0800 Subject: [PATCH 04/39] correct default_dtype for ones, zeros, linspace, logspace, eye, full (#57487) --- python/paddle/tensor/creation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d6cad4b8eca34..c3e814cc906d4 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -303,7 +303,7 @@ def linspace(start, stop, num, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() tensor_num = num tensor_start = start tensor_stop = stop @@ -434,7 +434,7 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): [1.] """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() tensor_num = num tensor_start = start tensor_stop = stop @@ -1010,7 +1010,7 @@ def ones(shape, dtype=None, name=None): [1. 1.]] """ if dtype is None: - dtype = core.VarDesc.VarType.FP32 + dtype = paddle.get_default_dtype() return fill_constant(value=1.0, shape=shape, dtype=dtype, name=name) @@ -1094,7 +1094,7 @@ def zeros(shape, dtype=None, name=None): [0. 0.]] """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() return fill_constant(value=0.0, shape=shape, dtype=dtype, name=name) @@ -1176,8 +1176,8 @@ def _check_attr(attr, message): _check_attr(num_rows, "num_rows") if dtype is None: - dtype = core.VarDesc.VarType.FP32 - elif not isinstance(dtype, core.VarDesc.VarType): + dtype = paddle.get_default_dtype() + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if num_columns is not None: _check_attr(num_columns, "num_columns") @@ -1270,7 +1270,7 @@ def full(shape, fill_value, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.get_default_dtype() return fill_constant(shape=shape, dtype=dtype, value=fill_value, name=name) From be7ae2c74d19fc0ea0c1e205478389b98c537595 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 20 Sep 2023 21:13:00 +0800 Subject: [PATCH 05/39] Try to fix performance drop. 
(#57525) --- paddle/phi/kernels/gpu/flip_kernel.cu | 7 ++++++- paddle/phi/kernels/gpu/index_put_grad_kernel.cu | 12 ++++++++++-- paddle/phi/kernels/gpu/index_put_kernel.cu | 6 +++++- paddle/phi/kernels/gpu/roll_kernel_impl.h | 6 +++++- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index f271eba26e0ab..71fdbcaaa68bb 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -40,7 +40,12 @@ __global__ void FlipCudaKernel(const T* in_data, int64_t cur_indices = idx; int64_t rem = 0; int64_t dst_offset = 0; - for (int i = 0; i < rank; ++i) { + +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } int64_t temp = cur_indices; cur_indices = cur_indices / stride[i]; rem = temp - cur_indices * stride[i]; diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 7e584e5c10318..915c7f40fa2cb 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -40,7 +40,11 @@ __global__ void SetZeroCudaKernel(int64_t** indices, int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; @@ -69,7 +73,11 @@ __global__ void IndexPutGradCudaKernel( int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index ccbd19aaba681..3af220ce16b31 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -41,7 +41,11 @@ __global__ void IndexPutCudaKernel(const T* x, return; } int64_t offset = 0; - for (int i = 0; i < rank; ++i) { +#pragma unroll + for (int i = 0; i < DDim::kMaxRank; ++i) { + if (i >= rank) { + break; + } cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index 38e2a6ff669ad..c7ffcb2d5ca52 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -40,7 +40,11 @@ __global__ void RollCudaKernel(const T* input, int64_t output_idx = idx; int64_t new_dim_idx = 0; - for (size_t i = 0; i < rank; i++) { +#pragma unroll + for (size_t i = 0; i < DDim::kMaxRank; i++) { + if (i >= rank) { + break; + } new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i]; if (new_dim_idx >= sizes[i]) { output_idx += (shifts[i] - sizes[i]) * strides[i]; From c5d0e0c6b6930f8e25d24bf9c1ff189657552726 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 21 Sep 2023 07:24:41 +0800 Subject: [PATCH 06/39] sharding stage 2 main grad bug fix (#57537) --- python/paddle/distributed/sharding/group_sharded.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 2bbc93259eaa8..350f6eff4d001 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -28,6 
+28,9 @@ from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( GroupShardedScaler, ) +from paddle.distributed.fleet.utils.mix_precision_utils import ( + MixPrecisionOptimizer, +) from paddle.distributed.utils.log_utils import get_logger from paddle.optimizer import Optimizer @@ -111,9 +114,10 @@ def group_sharded_parallel( assert isinstance( model, paddle.nn.Layer ), "The model must be the instance of paddle.nn.Layer." - assert isinstance( - optimizer, Optimizer - ), "The optimizer must be the instance of paddle.optimizer.Optimizer." + assert isinstance(optimizer, (MixPrecisionOptimizer, Optimizer)), ( + "The optimizer must be the instance of paddle.optimizer.Optimizer " + "or MixPrecisionOptimizer for main grad." + ) assert level in [ 'os', 'os_g', From 058b008e721e87c2f7b25079d49c66b47849d175 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:11:47 +0800 Subject: [PATCH 07/39] =?UTF-8?q?=E3=80=90pir=E3=80=91add=20all=20Slice=20?= =?UTF-8?q?newir=20test=20=20(#57529)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add reference of lbfgs * add reference of lbfgs * tmp * split gen modify * fix conflict * add split * fix bug * fix bug * test split * add meta tensor * refine code * fix bug * fix bug * fix comflict * Call _C_ops.sum in new ir * modify concat kernel choose * modify ci * modify sum zero_dim optest * modify split_with_num api * modify split -1 * modify split test * fix bug * xxx * delete extra modify * add add_n * tmp * add split_with_num_grad * expand first * expand first * modify split grad num bug * modify ci * modify ci * clear code * modify * recover * add add_n stop_gradient infer * modify opreslut to value * fix conflict * recover to aviod conflict * recover to aviod conflict * modify opreslut to value * recover complex tanh * modify add_n optest * skip bfp16 * modify split bf16 * fix conflict * modify expand special case * delete print * code style * slice optest pass --------- Co-authored-by: zhangbo9674 Co-authored-by: 0x45f --- .../pir/dialect/op_generator/op_build_gen.py | 1 + test/legacy_test/test_slice_op.py | 56 +++++++++++++------ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index bfb20bb8e283d..33bb81e43bf64 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -19,6 +19,7 @@ 'SplitWithNumInferMeta', 'ConcatInferMeta', 'ReduceIntArrayAxisInferMeta', + 'SliceRawInferMeta', } _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'} diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py index 194e933e1d0ec..065251b246928 100644 --- a/test/legacy_test/test_slice_op.py +++ b/test/legacy_test/test_slice_op.py @@ -71,7 +71,11 @@ def test_check_output(self): def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_prim=True + ['Input'], + 'Out', + max_relative_error=0.006, + check_prim=True, + check_new_ir=True, ) @@ -157,7 +161,11 @@ def test_check_output(self): def test_check_grad_normal(self): self.check_grad( - ['Input'], 'Out', max_relative_error=0.006, check_prim=True + ['Input'], + 'Out', + max_relative_error=0.006, + check_prim=True, + check_new_ir=True, ) @@ -195,10 +203,12 @@ def config(self): self.starts_infer = [-1, 0, -1] 
def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 2: starts(list, have tensor), ends(list, no tensor) @@ -238,10 +248,12 @@ def config(self): self.starts_infer = [1, -1, 2] def test_check_output(self): - self.check_output() + self.check_output(check_dygraph=True, check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) class TestSliceOp_decs_dim_5_starts_ListTensor( @@ -289,10 +301,12 @@ def config(self): self.out = self.input[1, 0:3, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 4: starts(tensor), ends(tensor) @@ -325,10 +339,12 @@ def config(self): self.out = self.input[1:3, 0:3, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 5: starts(tensor), ends(tensor) @@ -362,10 +378,12 @@ def config(self): self.out = self.input[1, 0, 2:4, :] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) # Situation 6: starts(tensor), ends(list, have tensor) @@ -406,10 +424,12 @@ def config(self): self.ends_infer = [-1, 3, 4] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', max_relative_error=0.006) + self.check_grad( + ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True + ) class TestSliceOp_ZeroDim(OpTest): @@ -448,10 +468,10 @@ def config(self): self.out = self.input[0:20, 1:3, 1:3] def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out') + self.check_grad(['Input'], 'Out', check_new_ir=True) # Test CUDA float16 @@ -499,6 +519,7 @@ def test_check_grad_normal(self): ['Input'], 'Out', check_prim=True, + check_new_ir=True, ) @@ -546,6 +567,7 @@ def test_check_grad_normal(self): 'Out', numeric_grad_delta=0.5, check_prim=True, + check_new_ir=True, ) @@ -578,7 +600,7 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['Input'], 'Out', check_prim=True) + self.check_grad(['Input'], 'Out', check_prim=True, check_new_ir=True) # Test python API From 164abf27d2ae1d8e90691b26bc01789002535d46 Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:49:37 +0800 Subject: [PATCH 08/39] Support control flow for static build [Step 2: support conditional_block] (#56696) * add conditional_block to OperatorBasesHandledInStaticBuild * run op in FakeInitializeOutputsForOperatorBase * add init_success judge * fix 
build error * fix * add SetSubBlockCore func * add PreStaticRun func * add PreStaticRun to interpreter_base and new_ir_inter * recover codes * add PreStaticBuild and BlockCanBeStaticBuilt * fix logic about RunPreStaticBuild * change CreateOpFromOpDesc type * fix build error * fix build error * remove IsOperatorBasesHandledInStaticBuild * recover BlockCanBeStaticBuilt * add logic about conditional_block run static build * recover codes * recover BlockCanBeStaticBuilt * support static build condational block op when condational block is the last op in the block * fix error * fix logic about last op * fit for sub block can't open static build * add IsStaticBuild * fix build error * fit logic when sub block can't open static build * close static build when sub_block don't support static_build * recover third party * add is_skil_fake_init logic * set the backend of the lamb * change start index * add if conditional for cal is_skip_fake_init * change name * close static_build for test_conditional_block * add static buiild support for conditional block in case of the output's dtype/place is changed but the following op is not use this output * fix logic error * fix timeout error * fix * remove useless codes * fix * fix * fix build error * move GetVarsInfo and RunPreStaticBuild from opeartor to static_build * fix lamb backend registe * fix build error * fix build error * remove lamp op test from new_ir_op_test_white_list * fix * move generating following_input_vars logic to static_build.cc * remove HasInfo * fix build error * recover codes and turn off the flag --- .../interpreter/interpreter_util.cc | 26 +- .../new_executor/interpreter/static_build.cc | 222 ++++++++++++++++-- .../new_executor/interpreter/static_build.h | 38 ++- .../new_executor/interpreter_base_impl.h | 6 + .../framework/new_executor/interpretercore.cc | 8 + .../framework/new_executor/interpretercore.h | 5 + .../new_executor/new_ir_interpreter.cc | 7 + .../new_executor/new_ir_interpreter.h | 6 + .../new_executor/program_interpreter.cc | 57 +++-- .../new_executor/program_interpreter.h | 6 +- paddle/phi/kernels/gpu/lamb_kernel.cu | 2 + test/legacy_test/CMakeLists.txt | 4 + test/white_list/new_ir_op_test_white_list | 2 - 13 files changed, 332 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 67106932169a3..8015a50545e69 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -527,11 +527,13 @@ platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op, return default_dev_ctx; } -void HandleOperatorBase(const platform::Place& place, - std::shared_ptr op, - OpFuncNode* op_func_node, - Scope* scope, - bool static_build) { +void HandleOperatorBase( + const platform::Place& place, + std::shared_ptr op, + OpFuncNode* op_func_node, + Scope* scope, + bool static_build, + std::vector> following_ops) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); // input, output is prepared. set the other attributes. 
@@ -542,7 +544,8 @@ void HandleOperatorBase(const platform::Place& place, if (OperatorBasesMustRunInStaticBuild.count(op->Type())) { op->Run(*scope, place); } - FakeInitializeOutputsForOperatorBase(*op, place, scope); + + FakeInitializeOutputsForOperatorBase(*op, place, scope, following_ops); } else { op->Run(*scope, place); // Run without data transformer. } @@ -690,8 +693,15 @@ void BuildOpFuncList(const platform::Place& place, if (dynamic_cast(op) == nullptr) { VLOG(4) << "HandleOperatorBase"; // op is not a operatorwithkernel, so direcly run OperatorBase::Run() - HandleOperatorBase( - place, ops[i], &op_func_node, local_scope, static_build); + + std::vector> following_ops( + ops.begin() + i + 1, ops.end()); + HandleOperatorBase(place, + ops[i], + &op_func_node, + local_scope, + static_build, + following_ops); vec_func_list->emplace_back(op_func_node); } else { VLOG(4) << "OP is not null"; diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 69b4920050925..0f9bd3f387a92 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -15,11 +15,18 @@ #include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h" +#ifdef PADDLE_WITH_DNNL +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + // These Ops is OperatorBase, but we have been handle them in static build -std::set OperatorBasesHandledInStaticBuild = {"read"}; +std::set OperatorBasesHandledInStaticBuild = {"read", + "conditional_block"}; std::set OperatorBasesMustRunInStaticBuild = { "create_double_buffer_reader", "create_py_reader"}; @@ -53,11 +60,68 @@ namespace paddle { namespace framework { namespace interpreter { +using InterpreterCore = framework::InterpreterCore; + +static VarMetaInfo GetVarMetaInfo(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + phi::DataType dtype = phi::DataType::UNDEFINED; + phi::Place place = phi::Place(); + if (var == nullptr) { + return VarMetaInfo(name, dtype, place); + } + + if (var->IsType()) { + const phi::DenseTensor& tensor = var->Get(); + if (!UNLIKELY(!tensor.IsInitialized())) { + dtype = tensor.dtype(); + place = tensor.place(); + } + } else if (var->IsType()) { + auto tensor = var->Get().value(); + if (!UNLIKELY(!tensor.IsInitialized())) { + dtype = tensor.dtype(); + place = tensor.place(); + } + } + return VarMetaInfo(name, dtype, place); +} + +std::vector GetVarsInfo(const Scope* scope, + VariableNameMap var_map, + const OperatorBase& op) { + std::vector var_info; + + const std::unordered_set* no_need_buffer_vars = nullptr; + if (op.Info().NoNeedBufferVarsInferer()) { + no_need_buffer_vars = &(op.Info().NoNeedBufferVarsInferer()( + op.Inputs(), op.Outputs(), op.Attrs())); + if (no_need_buffer_vars->empty()) no_need_buffer_vars = nullptr; + } + for (auto it = var_map.begin(); it != var_map.end();) { + auto& var = *it; + bool is_no_need_buffer_var = + (no_need_buffer_vars && no_need_buffer_vars->count(var.first) > 0); + std::string var_name; + var_info.reserve(var_info.size() + var.second.size()); + for (size_t i = 0; i < var.second.size(); ++i) { + auto var_name = 
var.second[i]; + if (scope && is_no_need_buffer_var) { + var_info.emplace_back(GetVarMetaInfo(*scope, var_name)); + } else { + var_info.emplace_back(var_name); + } + } + ++it; + } + return var_info; +} + bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { - // in_black_list = (kernelCode >> 7) & 1 - // is_operator_base = (kernelCode >> 6) & 1 - // is_custom_op = (kernelCode >> 5) & 1 - // use_mkldnn = (kernelCode >> 4) & 1 + // in_black_list = (kernelCode >> 5) & 1 + // is_operator_base = (kernelCode >> 4) & 1 + // is_custom_op = (kernelCode >> 3) & 1 + // use_mkldnn = (kernelCode >> 2) & 1 + // sub_block_can_not_static_build = (kernelCode >> 1) & 1 using KernelCode = int8_t; std::set> invalid_ops; for (auto& op : block.AllOps()) { @@ -77,17 +141,22 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { use_mkldnn = attr.index() == 1 ? PADDLE_GET_CONST(int, attr) : PADDLE_GET_CONST(bool, attr); } - bool has_structured_kernel = - phi::KernelFactory::Instance().HasStructuredKernel(op_type); + + bool sub_block_can_not_static_build = false; + if (op->HasAttr("sub_block")) { + auto* sub_block = + PADDLE_GET_CONST(framework::BlockDesc*, op->GetAttr("sub_block")); + sub_block_can_not_static_build = !BlockCanBeStaticBuilt(*sub_block); + } KernelCode kernel_code = static_cast( - (in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) + - (use_mkldnn << 4) + (has_structured_kernel << 2)); + (in_black_list << 5) + (is_operator_base << 4) + (is_custom_op << 3) + + (use_mkldnn << 2) + (sub_block_can_not_static_build << 1)); if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { if (in_black_list || (is_operator_base && !OperatorBasesHandledInStaticBuild.count(op_type)) || - is_custom_op || use_mkldnn) { + is_custom_op || use_mkldnn || sub_block_can_not_static_build) { invalid_ops.insert(std::make_pair(op_type, kernel_code)); } } @@ -97,11 +166,12 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { std::stringstream ss; ss << "The following OPs are unable to static build:\n"; for (auto& item : invalid_ops) { - ss << item.first << " [in_black_list = " << (item.second >> 7 & 1) - << ", is_operator_base = " << (item.second >> 6 & 1) - << ", is_custom_op = " << (item.second >> 5 & 1) - << ", use_mkldnn = " << (item.second >> 4 & 1) - << (item.second >> 2 & 1) << "]\n"; + ss << item.first << " [in_black_list = " << (item.second >> 6 & 1) + << ", is_operator_base = " << (item.second >> 5 & 1) + << ", is_custom_op = " << (item.second >> 4 & 1) + << ", use_mkldnn = " << (item.second >> 3 & 1) + << ", sub_block_can_not_static_build = " << (item.second >> 1 & 1) + << "]\n"; } VLOG(1) << ss.str(); } @@ -318,9 +388,59 @@ void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx, } } -void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, - const phi::Place& place, - Scope* scope) { +void RunPreStaticBuild(const framework::Scope& scope, + const platform::Place& dev_place, + const OperatorBase& op) { + auto* scope_var = scope.FindVar(op.Output("Scope")); + PADDLE_ENFORCE_NOT_NULL( + scope_var, + platform::errors::PreconditionNotMet( + "Expect Scope variable to be set in conditional_block_op, but " + "got a null Scope variable. Please set the Scope variable.")); + + auto* scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + + auto& cur_scope = *scopes->front(); +#ifdef PADDLE_WITH_DNNL + // Executor on being destroyed clears oneDNN cache and resets + // registered model data layout. 
This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif + auto* block = op.Attr("sub_block"); + VLOG(3) << "Conditional block.idx = " << block->ID() + << ", scope = " << &cur_scope; + + auto& skip_vars = + op.Attr>("skip_eager_deletion_vars"); + + std::unique_ptr core; + LOG_FIRST_N(INFO, 1) + << "[ControlFlow][ConditionalBlock] New Executor is Running."; + + VLOG(10) << "[interpreterCore cache]" << core.get(); + VLOG_IF(10, core) << platform::is_same_place(core->GetPlace(), dev_place); + + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(skip_vars.begin(), skip_vars.end()); + + core.reset( + new InterpreterCore(dev_place, *block, &cur_scope, execution_config)); + + std::vector op_func_nodes; + core->Build({}, &op_func_nodes); +} + +void FakeInitializeOutputsForOperatorBase( + const OperatorBase& op, + const phi::Place& place, + Scope* scope, + std::vector> following_ops) { const std::string& op_type = op.Type(); if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { return; @@ -329,7 +449,59 @@ void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, phi::DeviceContext* dev_ctx = platform::DeviceContextPool::Instance().Get(place); - if (op_type == "read") { + if (op_type == "conditional_block") { + // Note(sonder): skip fake init for conditional_block when there is no + // op with kernel after it. + bool skip_fake_init = true; + std::unordered_set following_input_vars; + + for (size_t i = 0; i < following_ops.size(); ++i) { + if (dynamic_cast( + following_ops[i].get()) != nullptr) { + VLOG(4) << "Find op with kernel after conditional_block : " + << following_ops[i]->Type(); + skip_fake_init = false; + auto input_vars_info = GetVarsInfo( + scope, following_ops[i]->Inputs(), *following_ops[i].get()); + for (auto& input_var_info : input_vars_info) { + following_input_vars.insert(input_var_info.name_); + } + } + } + + if (skip_fake_init) { + return; + } + + const std::vector out_var_info_before_build = + GetVarsInfo(scope, op.Outputs(), op); + + RunPreStaticBuild(*scope, place, op); + const std::vector out_var_info_after_build = + GetVarsInfo(scope, op.Outputs(), op); + + // Note(sonder): static_build is not supported if the output of + // conditional_block is changed after static build. + for (size_t i = 0; i < out_var_info_before_build.size(); ++i) { + // static build is supported in case of the output's dtype/place + // is changed but the following op is not use this output + if (out_var_info_before_build[i] != out_var_info_after_build[i]) { + auto var_name = out_var_info_before_build[i].name_; + if (following_input_vars.count(var_name)) { + PADDLE_THROW(phi::errors::PreconditionNotMet( + "The output %s s' dtype/place of conditional_block is " + "changed after static build. Befer static build, the " + "dtype is %s, place is %s. 
After static " + "build, the dtype is %s, place is %s.", + var_name, + out_var_info_before_build[i].dtype_, + out_var_info_before_build[i].place_, + out_var_info_after_build[i].dtype_, + out_var_info_after_build[i].place_)); + } + } + } + } else if (op_type == "read") { const std::string& reader_name = op.Input("Reader"); framework::ReaderHolder* reader = GET_DATA_SAFELY(scope->FindVar(reader_name), "Input", "Reader", "Read") @@ -448,6 +620,18 @@ void FakeInitializeOutputsForFunctionKernel( if (beta1_pow->place() == beta2_pow->place()) { backend = phi::TransToPhiBackend(beta1_pow->place()); } + } else if (op_type == "lamb") { + phi::TensorBase* beta1_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta1Pow")->second.at(0)); + phi::TensorBase* beta2_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta2Pow")->second.at(0)); + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU && + beta1_pow->place().GetType() == AllocationType::CPU && + beta2_pow->place().GetType() == AllocationType::CPU) { + backend = phi::Backend::CPU; + } else { + backend = phi::TransToPhiBackend(dev_ctx.GetPlace()); + } } else if (op_type == "reshape2") { phi::TensorBase* x = GetTensorFormVar(runtime_ctx.inputs.find("X")->second.at(0)); diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.h b/paddle/fluid/framework/new_executor/interpreter/static_build.h index e070f66b02549..302d612bc0311 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.h +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.h @@ -23,11 +23,39 @@ namespace paddle { namespace framework { namespace interpreter { +struct VarMetaInfo { + std::string name_; + phi::DataType dtype_; + phi::Place place_; + + explicit VarMetaInfo(const std::string& name) : name_(name) { + dtype_ = phi::DataType::UNDEFINED; + place_ = phi::Place(); + } + + VarMetaInfo(const std::string& name, + const phi::DataType& dtype, + const platform::Place& place) + : name_(name), dtype_(dtype), place_(place) {} + + bool operator==(const VarMetaInfo& other) const { + return name_ == other.name_ && dtype_ == other.dtype_ && + place_ == other.place_; + } + + bool operator!=(const VarMetaInfo& other) const { + return name_ != other.name_ || dtype_ != other.dtype_ || + place_ != other.place_; + } +}; + bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); -void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, - const platform::Place& place, - Scope* scope); +void FakeInitializeOutputsForOperatorBase( + const OperatorBase& op, + const phi::Place& place, + Scope* scope, + std::vector> following_ops); void FakeInitializeOutputsForFunctionKernel( const framework::OperatorBase& op, @@ -40,6 +68,10 @@ void FakeInitializeOutputsForStructureKernel( const framework::OpKernelType& op_kernel_type, ExecutionContext* execution_context); +std::vector GetVarsInfo(const Scope* scope, + VariableNameMap var_map, + const OperatorBase& op); + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 2c030ef1dc264..369216e0078c4 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -97,6 +97,12 @@ class InterpreterBaseImpl { virtual std::shared_ptr> GetDependencyCount() const = 0; virtual bool IsSharedResultsBuild() const = 0; + + virtual void Build( + const std::vector& 
feed_names, + std::vector* op_func_nodes) = 0; + + virtual bool IsStaticBuild() const = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dc8110331a176..8e052d3b2685e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -121,5 +121,13 @@ void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { impl_->SetOutputHooks(hookfuncs); } +void InterpreterCore::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + impl_->Build(feed_names, op_func_nodes); +} + +bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 47f2d9c6a3378..d21bd9e1fc378 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -74,6 +74,11 @@ class InterpreterCore { void SetOutputHooks(const std::vector& hookfuncs); + void Build(const std::vector& feed_names, + std::vector* op_func_nodes); + + bool IsStaticBuild() const; + private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 6b6cabb991382..55f70a573a1bc 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1315,6 +1315,13 @@ void NewIRInterpreter::PreAnalysis() { VLOG(4) << "Done UpdateNcclOpNum"; } +void NewIRInterpreter::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + PADDLE_THROW(platform::errors::Unimplemented( + "Build is not implemented in NewIRInterpreter.")); +} + ::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) { for (auto kv : value_2_var_name_) { if (kv.second == var_name) { diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index cf5cb21ce81aa..c05eb6770b2ba 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -100,6 +100,12 @@ class NewIRInterpreter : public InterpreterBaseImpl { void CheckCUDAGraphBeforeRun(const std::vector& feed_names); void PrepareForCUDAGraphCapture(); + void Build( + const std::vector& feed_names, + std::vector* op_func_nodes) override; + + bool IsStaticBuild() const override { return static_build_; } + // workqueue std::shared_ptr GetWorkQueue(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index a29e45515d894..1384a9fb487de 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -52,10 +52,6 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, var_scope_(scope) { VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_; - static_build_ = FLAGS_new_executor_static_build && - !FLAGS_new_executor_use_cuda_graph && - interpreter::BlockCanBeStaticBuilt(block); - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = 
main_thread_blocker_.RegisterEvent(kTaskCompletion); @@ -73,6 +69,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, } var_scope_.SetLocalScope(local_scope_); + static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + interpreter::BlockCanBeStaticBuilt(block); + instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) { SchedulingPriority lhs_scheduling_priority = vec_instruction_[lhs].GetSchedulingPriority(); @@ -129,28 +129,10 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch) { - SetDeviceId(place_); - CheckCUDAGraphBeforeRun(feed_names); - -#ifdef PADDLE_WITH_DNNL - platform::AttachPointerHashToMKLDNNKey(this, place_); -#endif + std::vector op_func_nodes; + Build(feed_names, &op_func_nodes); if (!is_build_) { - LOG_FIRST_N(INFO, 1) << "New Executor is Running."; - paddle::framework::interpreter::BuildVariableScope( - block_, execution_config_, &var_scope_); - - std::vector op_func_nodes; - paddle::framework::interpreter::BuildOpFuncList( - place_, - block_, - execution_config_.skip_gc_vars, - &op_func_nodes, - &var_scope_, - execution_config_, - HasLocalScope(), - static_build_); SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); @@ -189,6 +171,33 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, } } +void ProgramInterpreter::Build( + const std::vector& feed_names, + std::vector* op_func_nodes) { + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_DNNL + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is Running."; + paddle::framework::interpreter::BuildVariableScope( + block_, execution_config_, &var_scope_); + + paddle::framework::interpreter::BuildOpFuncList( + place_, + block_, + execution_config_.skip_gc_vars, + op_func_nodes, + &var_scope_, + execution_config_, + HasLocalScope(), + static_build_); + } +} + FetchList ProgramInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors) { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 27348d57fcd17..bef6385c211fb 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -48,6 +48,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true) override; + void Build( + const std::vector& feed_names, + std::vector* op_func_nodes) override; + void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override; @@ -92,7 +96,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { force_evnets_to_wait_ = force_evnets_to_wait; } - bool IsStaticBuild() const { return static_build_; } + bool IsStaticBuild() const override { return static_build_; } private: // build graph diff --git a/paddle/phi/kernels/gpu/lamb_kernel.cu b/paddle/phi/kernels/gpu/lamb_kernel.cu index 220fa97a0e107..c1d1a812a881e 100644 --- a/paddle/phi/kernels/gpu/lamb_kernel.cu +++ b/paddle/phi/kernels/gpu/lamb_kernel.cu @@ -33,4 +33,6 @@ PD_REGISTER_KERNEL(lamb, kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(4).SetDataType(phi::DataType::UNDEFINED); 
kernel->OutputAt(5).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); } diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 5e000112784aa..9e7adef0a634f 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1272,6 +1272,10 @@ set_tests_properties( set_tests_properties( test_cuda_graph_static_mode_error PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") +# In test_conditional_block, the sub block changes the dtype and place of the output variable. +# The changed variable is used in the following op. Static build is not supported for this case. +set_tests_properties(test_conditional_block + PROPERTIES ENVIRONMENT "FLAGS_new_executor_static_build=0") # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default. set(STATIC_BUILD_TESTS diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index 613769ec5b657..b85c88fa6bb18 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -116,8 +116,6 @@ test_kron_op test_kthvalue_op test_label_smooth_op test_label_smooth_op_new_ir -test_lamb_op -test_lamb_op_static_build test_lerp_op test_lgamma_op test_linear_interp_v2_op From 33d8ee204897a27ccbbb81a052b81cd1dbdf04fe Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Sep 2023 09:55:35 +0800 Subject: [PATCH 09/39] [Pir] Support Run with feed_tensor (#57497) * refine * add flag * add ut --- .../new_executor/new_ir_interpreter.cc | 115 +++++++++++++++++- test/cpp/new_executor/CMakeLists.txt | 1 + .../standalone_executor_new_ir_test.cc | 81 ++++++++++++ 3 files changed, 195 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 55f70a573a1bc..47823eb82b428 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -935,8 +935,108 @@ void NewIRInterpreter::ConstructEventForJitInput() { paddle::framework::FetchList NewIRInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors) { - PADDLE_THROW(platform::errors::Unimplemented( - "Run with feed_tensors is not implemented in NewIRInterpreter.")); + auto FeedInput = [&] { + VLOG(4) << "Feed inputs"; + for (size_t i = 0; i < feed_names.size(); ++i) { + auto* feed_var = InnerScope()->FindVar(feed_names[i]); + PADDLE_ENFORCE_NOT_NULL( + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); + + auto feed_tensor = feed_var->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + feed_tensor->set_lod(feed_tensors[i].lod()); + } + }; + + SetDeviceId(place_); + CheckCUDAGraphBeforeRun(feed_names); + +#ifdef PADDLE_WITH_DNNL + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif + + FeedInput(); + + if (!is_build_) { + LOG_FIRST_N(INFO, 1) << "New Executor is BetaRunning."; + // Build + VLOG(4) << "Done BuildScope"; + VLOG(4) << DebugValueInfo(); + + SolvePersisableVarNames(); + + VLOG(4) << "Parameter value include: "; + for (auto parameter : parameter_var_names_) { + VLOG(4) << "Parameter value: " << parameter; + } + + BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; + + PreAnalysis(); + 
VLOG(4) << "Done PreAnalysis"; + + // Run + if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 || + ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && + (sync_op_num_ == 0))) { + LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode " + "with trace version."; + TraceRunImpl(); + } else { + LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode " + "with multi thread version."; + MultiThreadRunImpl(); + } + + is_build_ = true; + is_shared_results_build_ = true; + } else { + if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 || + ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && + (sync_op_num_ == 0))) { + TraceRunImpl(); + } else { + MultiThreadRunImpl(); + } + } + + if (HasLocalScope()) { + ClearLoDTensorArrayInLocalScope(); + } + // return Fetch Tensors + Scope* inner_scope = InnerScope(); + if (FLAGS_enable_new_ir_in_executor) { + framework::FetchList fetch_res; + + for (auto& var_name : fetch_var_names_) { + auto* var = inner_scope->FindVar(var_name); + VLOG(0) << "fetch " << var_name << "[" << var << "]"; + fetch_res.push_back(var->Get()); + } + + VLOG(4) << "get fetch list size: " << fetch_res.size(); + return fetch_res; + } else { + auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); + if (fetch_var) { + auto fetch_list = + std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } + } } FetchList NewIRInterpreter::Run(const std::vector& feed_names, @@ -1252,6 +1352,16 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(4) << "begin to run op " << instr_node->Name(); if (!instr_node->IsArtificial()) { instr_node->Run(); + + if (FLAGS_benchmark) { + instr_node->DeviceContext().Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << instr_node->Name() // NOLINT + << "): context wait and get last error"; +#endif + } + VLOG(4) << __func__ << " OP id:" << instr_node->Id() << " name:" << instr_node->Name() << " type:" << (instr_node->KernelType() == OpFuncType::kCpuSync @@ -1260,6 +1370,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { ? 
"kGpuSync" : "kGpuAsync")) << " runs on " << platform::GetCurrentThreadName(); + VLOG(4) << "done instruction node run"; CheckGC(instr_node); VLOG(4) << "done CheckGC"; diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt index 00285e39f518b..af09520b12a54 100644 --- a/test/cpp/new_executor/CMakeLists.txt +++ b/test/cpp/new_executor/CMakeLists.txt @@ -10,6 +10,7 @@ if(NOT WIN32) pd_op_dialect pd_kernel_dialect pir + phi standalone_executor) endif() diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index d200b2a1052ed..eac996ffebe0f 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -97,6 +97,87 @@ TEST(StandaloneExecutor, run) { EXPECT_EQ(res3, true); } +TEST(StandaloneExecutor, run_feed_tensor) { + pir::IrContext* ctx = pir::IrContext::Instance(); + pir::Program program(ctx); + + ctx->GetOrRegisterDialect(); + + pir::Builder builder = pir::Builder(ctx, program.block()); + + pir::OpInfo feed_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::FeedOp::name()); + + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {1}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{0}}; + size_t offset = 0; + pir::Type dense_tensor_dtype = paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset); + + pir::AttributeMap attr_map1; + attr_map1.insert(std::pair( + "name", pir::StrAttribute::get(ctx, "x"))); + attr_map1.insert(std::pair( + "col", pir::Int32Attribute::get(ctx, 0))); + pir::Operation* feed_op1 = + pir::Operation::Create({}, attr_map1, {dense_tensor_dtype}, feed_op_info); + program.block()->push_back(feed_op1); + + pir::AttributeMap attr_map2; + attr_map2.insert(std::pair( + "name", pir::StrAttribute::get(ctx, "y"))); + attr_map2.insert(std::pair( + "col", pir::Int32Attribute::get(ctx, 0))); + pir::Operation* feed_op2 = + pir::Operation::Create({}, attr_map2, {dense_tensor_dtype}, feed_op_info); + program.block()->push_back(feed_op2); + + builder.Build(feed_op1->result(0), + feed_op2->result(0)); + + auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program); + + auto place = platform::CPUPlace(); + Scope scope; + InterpreterCore test_core(place, {}, kernel_program->block(), &scope); + + std::stringstream os; + os << reinterpret_cast( + const_cast(test_core.Impl())); + std::string out_name = os.str() + "_inner_var_2"; + test_core.SetSkipGcVars({out_name}); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, dims, data_layout, lod, offset); + paddle::platform::DeviceContext* dev_ctx = + paddle::platform::DeviceContextPool::Instance().Get( + paddle::platform::CPUPlace()); + + phi::DenseTensor tensor_x; + tensor_x.set_meta(meta); + dev_ctx->Alloc(&tensor_x, phi::DataType::FLOAT32); + float* tensor_x_data = tensor_x.data(); + *tensor_x_data = 1.0; + + phi::DenseTensor tensor_y; + tensor_y.set_meta(meta); + dev_ctx->Alloc(&tensor_y, phi::DataType::FLOAT32); + float* tensor_y_data = tensor_y.data(); + *tensor_y_data = 2.0; + + test_core.Run({"x", "y"}, {tensor_x, tensor_y}); + + auto out_tensor = + test_core.local_scope() == nullptr + ? 
scope.FindVar(out_name)->Get() + : test_core.local_scope()->FindVar(out_name)->Get(); + + bool res0 = simple_cmp(out_tensor.data()[0], 3.0); + EXPECT_EQ(res0, true); +} + TEST(StandaloneExecutor, run_inplace_sqrt) { pir::IrContext* ctx = pir::IrContext::Instance(); pir::Program program((ctx)); From 2e5a6fbadef0fe215f08baba15dcecdf8039c7c6 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:12:09 +0800 Subject: [PATCH 10/39] [Pir] delete support mutable attribute for pow (#57503) * refien * fix bug * fix * refine --- .../fluid/pir/dialect/op_generator/api_gen.py | 8 +++++ .../fluid/pir/dialect/op_generator/op_gen.py | 8 +++++ paddle/fluid/primitive/codegen/gen.py | 2 +- test/legacy_test/test_activation_op.py | 32 ++----------------- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index 5a3afdf2036a9..d7e74f72b652f 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -125,6 +125,14 @@ def _parse_yaml(self, op_yaml_files, op_compat_yaml_file): op_compat_item = op_compat_parser.get_compat( op['forward']['name'] ) + + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items.append(OpInfoParser(op, op_compat_item)) return op_info_items diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 62e746044776d..46949bcb547a7 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -858,6 +858,14 @@ def OpGenerator( and 'forward' in op ): op_compat_item = op_compat_parser.get_compat(op['forward']['name']) + + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items[op['name']] = OpInfoParser(op, op_compat_item) # (3) CodeGen: Traverse op_info_items and generate ops_name_list = [] # all op class name store in this list diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index 0239f3d702e96..f9a920730967d 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -291,7 +291,7 @@ def extend_compat_info(apis, compats): backward_apis.append(apis_dict[backward_op_name]) support_tensor_attrs_names = [] compat_attrs_data_type = {} - if 'scalar' in compat_item: + if 'scalar' in compat_item and compat_item['op'] != "pow": for attr_name, attr_info in compat_item['scalar'].items(): if ( 'support_tensor' in attr_info diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 703cc4174d8f5..8b16ee5750eac 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -3588,33 +3588,7 @@ def init_shape(self): self.shape = [] -class TestPow_factor_tensor(TestActivation): - def setUp(self): - self.op_type = "pow" - self.python_api = paddle.pow - self.enable_cinn = False - self.init_dtype() - - np.random.seed(1024) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.power(x, 3) - - self.inputs = { - 'X': OpTest.np_dtype_to_base_dtype(x), - 'FactorTensor': np.array([3.0]).astype(self.dtype), - } - - self.attrs = {} - self.outputs = {'Out': out} - - def 
test_check_output(self): - self.check_output() - - def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad(['X'], 'Out') - +class TestPow_API(TestActivation): def test_api(self): with static_guard(): input = np.random.uniform(1, 2, [11, 17]).astype("float32") @@ -4526,7 +4500,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestLog1p) create_test_act_fp16_class(TestSquare) create_test_act_fp16_class(TestPow, check_prim=True) -create_test_act_fp16_class(TestPow_factor_tensor) +create_test_act_fp16_class(TestPow_API) create_test_act_fp16_class(TestSTanh) create_test_act_fp16_class(TestSoftplus) create_test_act_fp16_class(TestSoftsign) @@ -4657,7 +4631,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestLog1p) create_test_act_bf16_class(TestSquare) create_test_act_bf16_class(TestPow, check_prim=True) -create_test_act_bf16_class(TestPow_factor_tensor) +create_test_act_bf16_class(TestPow_API) create_test_act_bf16_class(TestSTanh) create_test_act_bf16_class(TestSoftplus) create_test_act_bf16_class(TestSoftsign) From 00bd3aa99f33add638f567998b74e07323e2b2b9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 21 Sep 2023 10:29:31 +0800 Subject: [PATCH 11/39] update get/set parameter (#57539) --- .../pir/dialect/operator/ir/api_builder.cc | 11 ++++++++++ .../pir/dialect/operator/ir/api_builder.h | 6 +++++ .../pir/dialect/operator/ir/manual_api.cc | 21 ++++++------------ .../pir/dialect/operator/ir/manual_api.h | 4 +--- paddle/fluid/pybind/ir.cc | 15 +++++++++++++ .../fluid/pybind/manual_static_op_function.h | 7 +----- python/paddle/base/data_feeder.py | 2 +- python/paddle/base/executor.py | 11 +++++----- python/paddle/ir/core.py | 22 +++++++++++++------ python/paddle/nn/initializer/constant.py | 7 +++++- python/paddle/nn/initializer/xavier.py | 13 ++++++----- python/paddle/tensor/math.py | 2 -- test/ir/new_ir/test_build_model.py | 12 +++++----- 13 files changed, 82 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc index 893c664b78b08..0662ced1cb40c 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc @@ -48,5 +48,16 @@ void APIBuilder::ResetInsertionPointToEnd() { builder_->SetInsertionPointToEnd(builder_->block()); } +pir::Parameter* APIBuilder::GetParameter(const std::string& name) const { + pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram(); + return program->GetParameter(name); +} + +void APIBuilder::SetParameter(const std::string& name, + std::unique_ptr&& parameter) { + pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram(); + program->SetParameter(name, std::move(parameter)); +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h index a06f529d2c5be..060102de4bde0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.h +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h @@ -17,6 +17,7 @@ #include "paddle/pir/core/builder.h" #include "paddle/pir/core/macros.h" +#include "paddle/pir/core/parameter.h" #include "paddle/pir/core/program.h" namespace paddle { @@ -40,6 +41,11 @@ class APIBuilder { void ResetInsertionPointToEnd(); + pir::Parameter* GetParameter(const std::string& name) const; + + void SetParameter(const std::string& name, + std::unique_ptr&& parameter); + std::shared_ptr GetBuilder() { return 
builder_; } private: diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index ba8fc47744ed3..24e7a94b66650 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builtin_op.h" - +#include "paddle/pir/core/parameter.h" namespace paddle { namespace dialect { @@ -46,25 +46,18 @@ pir::OpResult zeros_like(pir::Value x, return paddle::dialect::full_like(x, 0, dtype, place); } -pir::OpResult get_parameter(const std::string& name, - phi::DataType dtype, - const std::vector& shape) { - phi::LoD lod; - size_t offset{0}; - pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - pir::IrContext::Instance(), - TransToIrDataType(dtype), - phi::DDim(shape.data(), shape.size()), - phi::DataLayout::UNDEFINED, - lod, - offset); +pir::OpResult get_parameter(const std::string& name) { + pir::Parameter* param = APIBuilder::Instance().GetParameter(name); pir::GetParameterOp get_parameter_op = APIBuilder::Instance().GetBuilder()->Build( - name, out_dense_tensor_type); + name, param->type()); return get_parameter_op.result(0); } void set_parameter(pir::Value parameter, const std::string& name) { + std::unique_ptr param( + new pir::Parameter(nullptr, 0, parameter.type())); + APIBuilder::Instance().SetParameter(name, std::move(param)); APIBuilder::Instance().GetBuilder()->Build(parameter, name); } diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index 7e5aba6fcbaa8..c919448f1ddb0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -32,9 +32,7 @@ pir::OpResult zeros_like(pir::Value x, phi::DataType dtype = phi::DataType::UNDEFINED, const Place& place = {}); -pir::OpResult get_parameter(const std::string& name, - phi::DataType dtype, - const std::vector& shape); +pir::OpResult get_parameter(const std::string& name); void set_parameter(pir::Value parameter, const std::string& name); diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index db3faebb1985b..913d7d6f7aa80 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -153,6 +153,11 @@ void BindProgram(py::module *m) { [](const std::shared_ptr &self) { return self->parameters_num(); }) + .def("move_parameters_from", + [](const std::shared_ptr &self, + const std::shared_ptr &other) { + self->set_parameters(std::move(other->parameters())); + }) .def( "global_block", [](std::shared_ptr self) { return self->block(); }, @@ -375,9 +380,19 @@ void BindOpOperand(py::module *m) { bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name) { auto *defining_op = self.owner(); if (defining_op->HasAttribute(attr_name)) { + PADDLE_ENFORCE( + defining_op->attribute(attr_name).isa(), + paddle::platform::errors::InvalidArgument( + "%s: Callstack attributes of %s is not ArrayAttribute type", + attr_name)); auto attrs = defining_op->attribute(attr_name) .dyn_cast() .AsVector(); + PADDLE_ENFORCE(attrs[self.index()].isa(), + paddle::platform::errors::InvalidArgument( + "The index %d in %s is not BoolAttribute type", + self.index(), + attr_name)); return attrs[self.index()].dyn_cast().data(); } else { return true; diff --git a/paddle/fluid/pybind/manual_static_op_function.h 
b/paddle/fluid/pybind/manual_static_op_function.h index 68b9e22ec7f94..7c32b2ab1d4fa 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -35,13 +35,8 @@ static PyObject *static_api_get_parameter(PyObject *self, // Parse Attributes PyObject *name_obj = PyTuple_GET_ITEM(args, 0); std::string name = CastPyArg2String(name_obj, "name", 0); - PyObject *dtype_obj = PyTuple_GET_ITEM(args, 1); - phi::DataType dtype = CastPyArg2DataTypeDirectly(dtype_obj, "dtype", 1); - PyObject *shape_obj = PyTuple_GET_ITEM(args, 2); - phi::IntArray shape = CastPyArg2IntArray(shape_obj, "shape", 2); // Call ir static api - auto static_api_out = - paddle::dialect::get_parameter(name, dtype, shape.GetData()); + auto static_api_out = paddle::dialect::get_parameter(name); return ToPyObject(static_api_out); } catch (...) { diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 40154e1a0d429..78781a6856af1 100644 --- a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -47,7 +47,7 @@ _PADDLE_NEW_IR_DTYPE_2_NUMPY_DTYPE = { core.DataType.BOOL: 'bool', core.DataType.FLOAT16: 'float16', - core.DataType.UINT16: 'uint16', + core.DataType.BFLOAT16: 'uint16', core.DataType.FLOAT32: 'float32', core.DataType.FLOAT64: 'float64', core.DataType.INT8: 'int8', diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 9ea3d566c824a..e5fddd15329e3 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -515,11 +515,12 @@ def _add_pir_fetch_ops(program, fetch_list, fetch_var_name): if not has_fetch_operations( global_block, fetch_list, fetch_var_name, fetch_op ): - for i, fetch_input in enumerate(fetch_list): - assert isinstance( - fetch_input, OpResult - ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input)) - paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i) + with paddle.static.program_guard(program): + for i, fetch_input in enumerate(fetch_list): + assert isinstance( + fetch_input, OpResult + ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input)) + paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i) def _merge_tensors(tensor, micro_batch_num): diff --git a/python/paddle/ir/core.py b/python/paddle/ir/core.py index 0ce01ebb3f593..908319458ed39 100644 --- a/python/paddle/ir/core.py +++ b/python/paddle/ir/core.py @@ -251,6 +251,12 @@ def program_guard(main_program, startup_program=None): switch_startup_program(startup_program) +class ParameterMeta: + def __init__(self, shape, dtype): + self.shape = shape + self.dtype = dtype + + def create_parameter( dtype, shape, @@ -266,19 +272,21 @@ def create_parameter( op_result_name = unique_name.generate('parameter') startup_program = default_startup_program() main_program = default_main_program() - - with program_guard(default_main_program()): - param = get_parameter(op_result_name, dtype, shape) - trainable = kwargs.get('trainable', True) - param.stop_gradient = not trainable - param.is_persistable = True + parameter_meta = ParameterMeta(shape, dtype) with program_guard(startup_program): initializer = kwargs['initializer'] init_result = initializer( - param, param.get_defining_op().get_parent_block() + parameter_meta, startup_program.global_block() ) init_result.is_persistable = True set_parameter(init_result, op_result_name) + main_program.move_parameters_from(startup_program) + with program_guard(default_main_program()): + param = get_parameter(op_result_name, dtype, shape) + 
trainable = kwargs.get('trainable', True) + param.stop_gradient = not trainable + param.is_persistable = True + return param diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index bc2baf08c9bb1..b4e9ee1df266a 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -58,7 +58,12 @@ def forward(self, var, block=None): assert isinstance( var, - (framework.Variable, framework.EagerParamBase, paddle.ir.OpResult), + ( + framework.Variable, + framework.EagerParamBase, + paddle.ir.OpResult, + paddle.ir.core.ParameterMeta, + ), ) assert isinstance(block, (framework.Block, paddle.ir.Block)) diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 7f479111fba3d..40eb6a874c9da 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -88,12 +88,13 @@ def forward(self, var, block=None): block = self._check_block(block) assert isinstance(block, (framework.Block, paddle.ir.Block)) - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "xavier_init", - ) + if not isinstance(var, paddle.ir.core.ParameterMeta): + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "xavier_init", + ) f_in, f_out = self._compute_fans(var) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 5cdd91b075426..56c553bce797e 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -889,8 +889,6 @@ def divide(x, y, name=None): """ if in_dynamic_or_pir_mode(): return _C_ops.divide(x, y) - elif in_pir_mode(): - return paddle._ir_ops.divide(x, y) else: return _elementwise_op(LayerHelper('elementwise_div', **locals())) diff --git a/test/ir/new_ir/test_build_model.py b/test/ir/new_ir/test_build_model.py index f356cfc24ffdf..a6ddae7c443ea 100644 --- a/test/ir/new_ir/test_build_model.py +++ b/test/ir/new_ir/test_build_model.py @@ -31,12 +31,12 @@ def test_basic_network(self): exe = paddle.static.Executor() x_feed = np.ones([4, 4], dtype=np.float32) * 10 y_feed = np.ones([4, 4], dtype=np.float32) * 2 - (sum_value,) = exe.run( - main_program, - feed={'x': x_feed, 'y': y_feed}, - fetch_list=[sum_out], - ) - self.assertEqual(sum_value, 5 * 4 * 4) + (sum_value,) = exe.run( + main_program, + feed={'x': x_feed, 'y': y_feed}, + fetch_list=[sum_out], + ) + self.assertEqual(sum_value, 5 * 4 * 4) main_program = paddle.static.Program() with paddle.static.program_guard(main_program): From 47040ef6c6df4b95617a58636b3c13ab64112a5a Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:31:27 +0800 Subject: [PATCH 12/39] add all cast newir test (#57527) --- test/legacy_test/test_cast_op.py | 42 ++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index e24eb6b44b631..47bc23d76f601 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -78,10 +78,16 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) class TestCastOpFp32ToFp16(OpTest): @@ -99,10 +105,16 
@@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) @unittest.skipIf( @@ -128,10 +140,16 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) @unittest.skipIf( @@ -157,20 +175,28 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + only_check_prim=True, + check_new_ir=True, + ) class TestCastOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input type of cast_op must be Variable. x1 = base.create_lod_tensor( np.array([[-1]]), [[1]], base.CPUPlace() ) self.assertRaises(TypeError, paddle.cast, x1, 'int32') + paddle.disable_static() class TestCastOpEager(unittest.TestCase): From 7bf03d344d53dd45ca23611d9de342e1e95c67d5 Mon Sep 17 00:00:00 2001 From: chen2016013 <111894720+chen2016013@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:43:39 +0800 Subject: [PATCH 13/39] [PIR] Print value info on python (#57471) * fix bug * rewrite __str__ in value and opresult to print info * fix bug * change as reviewed comments * change as reviewed comments * fix print str --- paddle/fluid/pybind/ir.cc | 37 +++++++++++++++++++++++++++++++- paddle/pir/core/ir_printer.cc | 5 +++++ paddle/pir/core/value.h | 2 ++ test/ir/new_ir/test_ir_pybind.py | 10 ++++++++- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 913d7d6f7aa80..22fd0f40a36b5 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -91,6 +92,20 @@ inline void SetProgramInt64Attr(std::shared_ptr program, attr_name, pir::Int64Attribute::get(pir::IrContext::Instance(), value)); } +std::string GetValueInfo(Value v) { + std::stringstream ss; + ss << "define_op_name=" << v.dyn_cast().owner()->name(); + ss << ", index=" << v.dyn_cast().index(); + ss << ", dtype=" << v.type(); + if (v.type().isa()) { + ss << ", place=" + << v.type() + .dyn_cast() + .place(); + } + return ss.str(); +} + void BindProgram(py::module *m) { py::class_> program(*m, "Program", R"DOC( Create Python Program. 
Program is an abstraction of model structure, divided into @@ -353,7 +368,14 @@ void BindValue(py::module *m) { return self.impl() == other.Value::impl(); }) .def("__hash__", - [](const Value &self) { return std::hash{}(self); }); + [](const Value &self) { return std::hash{}(self); }) + .def("__str__", [](const Value &self) -> py::str { + std::ostringstream print_stream; + print_stream << "Value("; + print_stream << GetValueInfo(self); + print_stream << ")"; + return print_stream.str(); + }); } void BindOpOperand(py::module *m) { @@ -472,6 +494,19 @@ void BindOpResult(py::module *m) { }) .def("__hash__", [](OpResult &self) { return std::hash{}(self); }) + .def("__str__", + [](OpResult &self) -> py::str { + std::ostringstream print_stream; + print_stream << "OpResult("; + print_stream << GetValueInfo(self); + if (GetOpResultBoolAttr(self, kAttrStopGradients)) { + print_stream << ", stop_gradient=True"; + } else { + print_stream << ", stop_gradient=False"; + } + print_stream << ")"; + return print_stream.str(); + }) .def( "get_defining_op", [](const OpResult &self) -> pir::Operation * { diff --git a/paddle/pir/core/ir_printer.cc b/paddle/pir/core/ir_printer.cc index 52c49be812104..260d42e035e4d 100644 --- a/paddle/pir/core/ir_printer.cc +++ b/paddle/pir/core/ir_printer.cc @@ -317,6 +317,11 @@ void Operation::Print(std::ostream& os) { printer.PrintOperation(this); } +void Value::Print(std::ostream& os) const { + IrPrinter printer(os); + printer.PrintValue(*this); +} + void Type::Print(std::ostream& os) const { BasicIrPrinter printer(os); printer.PrintType(*this); diff --git a/paddle/pir/core/value.h b/paddle/pir/core/value.h index 81a1717540e3d..00c7aa123746e 100644 --- a/paddle/pir/core/value.h +++ b/paddle/pir/core/value.h @@ -72,6 +72,8 @@ class IR_API Value { OpOperand first_use() const; + void Print(std::ostream &os) const; + bool use_empty() const; bool HasOneUse() const; diff --git a/test/ir/new_ir/test_ir_pybind.py b/test/ir/new_ir/test_ir_pybind.py index 34aa4c90c873f..b9a6fb92ac548 100644 --- a/test/ir/new_ir/test_ir_pybind.py +++ b/test/ir/new_ir/test_ir_pybind.py @@ -103,6 +103,11 @@ def test_value(self): ) # test value == opresult self.assertEqual(add_op.operands_source()[0], matmul_op.results()[0]) + # test opresult print + self.assertTrue( + 'dtype=pd_op.tensor<4x4xf32>' + in add_op.operands_source()[0].__str__() + ) # test opresult == value self.assertEqual( add_op.operands()[0].source(), add_op.operands_source()[0] @@ -110,10 +115,13 @@ def test_value(self): # test opresult == opresult self.assertEqual(add_op.operands()[0].source(), matmul_op.results()[0]) + # test opresult print self.assertEqual( tanh_op.operands()[0].source().get_defining_op().name(), "pd_op.add" ) - + self.assertTrue( + 'pd_op.tensor<4x4xf32>' in tanh_op.operands()[0].source().__str__() + ) add_op.replace_all_uses_with(matmul_op.results()) self.assertEqual( tanh_op.operands()[0].source().get_defining_op().name(), From 3fd69fa01736459182576d5c1916766f0e287714 Mon Sep 17 00:00:00 2001 From: Ruibin Cheung Date: Thu, 21 Sep 2023 10:54:50 +0800 Subject: [PATCH 14/39] [NewComm] No.10 compatiable upgrade for distributed_fused_lamb op (#57424) * [NewComm] No.10 compatiable upgrade for distributed_fused_lamb op * fix --- .../optimizers/distributed_fused_lamb_op.cu | 354 ++++++++++++++---- .../phi/core/distributed/nccl_comm_context.cc | 17 + .../phi/core/distributed/nccl_comm_context.h | 20 +- test/legacy_test/CMakeLists.txt | 4 +- .../distributed_fused_lamb_test_base.py | 5 +- 
...est_distributed_fused_lamb_op_with_clip.py | 18 + ...buted_fused_lamb_op_with_gradient_merge.py | 17 + 7 files changed, 359 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index fdec898edbe91..a672f5ac99aa8 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -21,6 +21,7 @@ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/cuda_stream.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" @@ -28,6 +29,14 @@ #include "paddle/phi/kernels/funcs/tensor_to_string.h" #include "paddle/utils/optional.h" +#include "paddle/fluid/distributed/collective/utils.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif + #ifdef __NVCC__ #include "cub/cub.cuh" #include "math.h" // NOLINT @@ -48,6 +57,19 @@ using MasterT = typename phi::dtype::MPTypeTrait::Type; using phi::funcs::FlattenToString; using phi::funcs::ToVector; +static void CheckCommContextHasRingId( + const distributed::CommContextManager &comm_context_manager, int ring_id) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); +} + template static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); @@ -875,24 +897,68 @@ static void MultiTensorUpdateLambParamAndBetaPows( } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, - ncclComm_t comm, - const void *scale, - ncclRedOp_t *op) { +static bool CreatePreMulScaleOpIfSupported( + ncclDataType_t dtype, + ncclComm_t comm, + const void *scale, + ncclRedOp_t *op, + distributed::NCCLCommContext *comm_ctx = nullptr) { #if NCCL_VERSION_CODE >= 21100 - int ver; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); - if (ver >= 21100) { - VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( - op, const_cast(scale), dtype, ncclScalarDevice, comm)); - return true; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But parameter of comm_ctx should not be nullptr.")); + int ver = comm_ctx->GetNcclVersion(); + if (ver >= 21100) { + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + comm_ctx->RedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice); + return true; + } + } else { + int ver; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); + if (ver >= 21100) { + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice, comm)); + return true; + } } #endif VLOG(10) << "ncclRedOpCreatePreMulSum is not supported."; return false; } +static void DestoryOpIfSupported( + ncclRedOp_t op, + ncclComm_t comm, + distributed::NCCLCommContext *comm_ctx = nullptr) { +#if NCCL_VERSION_CODE >= 21100 + VLOG(10) << "ncclRedOpDestroy starts"; + + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But parameter of comm_ctx should not be nullptr.")); + comm_ctx->RedOpDestroy(op); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); + } + VLOG(10) << "ncclRedOpDestroy ends"; + +#endif + VLOG(10) << "ncclRedOpDestroy is not supported."; +} + template static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, const T1 *x, @@ -922,7 +988,18 @@ static void NCCLSumWithScaleBase(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But parameter of comm_ctx should not be nullptr.")); + } + static_assert( std::is_same::value || std::is_same::value, "T must be either float32 or float16."); @@ -943,8 +1020,8 @@ static void NCCLSumWithScaleBase(const T *sendbuff, ncclRedOp_t op = ncclSum; ncclDataType_t dtype = std::is_same::value ? ncclFloat32 : ncclFloat16; - bool should_destroy_op = - scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op); + bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported( + dtype, comm, scale, &op, comm_ctx); memory_utils::Buffer buffer(dev_ctx.GetPlace()); if (scale && !should_destroy_op) { T *new_sendbuff = buffer.Alloc(numel); @@ -952,21 +1029,44 @@ static void NCCLSumWithScaleBase(const T *sendbuff, sendbuff = new_sendbuff; } - if (UseReduceScatter) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( - sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + if (comm_ctx) { + // Here assume comm_ctx->GetNcclComm() have higher priority than comm + if (UseReduceScatter) { + // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor, + // but sendbuff or recvbuff maybe allocated by Buffer. + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + dtype, + op, + comm_ctx->GetNcclComm(), + stream)); + } else { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but sendbuff or recvbuff maybe allocated by Buffer. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(sendbuff, + recvbuff, + recvcount, + dtype, + op, + comm_ctx->GetNcclComm(), + stream)); + } } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + if (UseReduceScatter) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } } -#if NCCL_VERSION_CODE >= 21100 if (should_destroy_op) { - VLOG(10) << "ncclRedOpDestroy starts"; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); - VLOG(10) << "ncclRedOpDestroy ends"; + DestoryOpIfSupported(op, comm, comm_ctx); } -#endif } template @@ -977,9 +1077,17 @@ static void NCCLReduceScatterWithScale(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase( - sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); + NCCLSumWithScaleBase(sendbuff, + recvbuff, + recvcount, + nranks, + comm, + stream, + dev_ctx, + comm_ctx, + scale); } template @@ -990,9 +1098,17 @@ static void NCCLAllReduceWithScale(const T *sendbuff, ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, + distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - NCCLSumWithScaleBase( - sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); + NCCLSumWithScaleBase(sendbuff, + recvbuff, + recvcount, + nranks, + comm, + stream, + dev_ctx, + comm_ctx, + scale); } #endif @@ -1643,26 +1759,71 @@ void DistributedFusedLambKernel( int64_t global_rank = 0, local_rank = 0; ncclComm_t global_comm = nullptr, local_comm = nullptr, external_comm = nullptr; - if (nranks > 1) { - auto *nccl_comm_handle = - paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place); - global_comm = nccl_comm_handle->comm(); - global_rank = nccl_comm_handle->rank(); + paddle::platform::NCCLComm *nccl_comm_handle = nullptr, + *local_nccl_comm_handle = nullptr; + distributed::NCCLCommContext *comm_ctx = nullptr, *local_comm_ctx = nullptr, + *external_comm_ctx = nullptr; + + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + CheckCommContextHasRingId(comm_context_manager, ring_ids[0]); + + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_ids[0]))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + global_comm = comm_ctx->GetNcclComm(); + global_rank = comm_ctx->GetRank(); if (local_shard) { - auto *local_nccl_comm_handle = - paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1], place); - local_comm = local_nccl_comm_handle->comm(); - local_rank = local_nccl_comm_handle->rank(); + CheckCommContextHasRingId(comm_context_manager, ring_ids[1]); + + local_comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_ids[1]))); + local_comm = local_comm_ctx->GetNcclComm(); + local_rank = local_comm_ctx->GetRank(); if (use_hierarchical_allreduce) { - external_comm = paddle::platform::NCCLCommContext::Instance() - .Get(ring_ids[2], place) - ->comm(); + CheckCommContextHasRingId(comm_context_manager, ring_ids[2]); + + external_comm_ctx = static_cast( + 
comm_context_manager.Get(std::to_string(ring_ids[2]))); + external_comm = external_comm_ctx->GetNcclComm(); } } else { local_comm = global_comm; local_rank = global_rank; } + + VLOG(3) << "new comm_context_manager has ring_id " << ring_ids[0]; + } else { + if (nranks > 1) { + nccl_comm_handle = + paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place); + global_comm = nccl_comm_handle->comm(); + global_rank = nccl_comm_handle->rank(); + if (local_shard) { + local_nccl_comm_handle = + paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1], + place); + local_comm = local_nccl_comm_handle->comm(); + local_rank = local_nccl_comm_handle->rank(); + if (use_hierarchical_allreduce) { + external_comm = paddle::platform::NCCLCommContext::Instance() + .Get(ring_ids[2], place) + ->comm(); + } + } else { + local_comm = global_comm; + local_rank = global_rank; + } + } } + memory_utils::Buffer grad_norm_square_buffer(place); auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc(2); memory_utils::Buffer cub_tmp_buffer(place); @@ -1715,7 +1876,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1723,7 +1885,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, @@ -1732,7 +1895,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1740,7 +1904,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1748,14 +1913,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); @@ -1766,14 +1933,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } // (2) Calculate the global grad norm GetSquareGradNorm(fp32_sum_grad, @@ -1786,6 +1955,8 @@ void DistributedFusedLambKernel( VLOG(1) << "Grad square norm before all reduce: " << FlattenToString(fp32_square_grad_norm, 1, place); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -1852,6 +2023,7 @@ void DistributedFusedLambKernel( local_comm, stream, dev_ctx, + local_comm_ctx, fp32_scale); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1860,8 +2032,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); - + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1870,6 +2042,7 @@ void DistributedFusedLambKernel( local_comm, stream, dev_ctx, + local_comm_ctx, fp16_scale); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1878,7 +2051,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1887,6 +2061,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp32_scale); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, @@ -1895,6 +2070,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp16_scale); } fp32_sum_grad += (local_rank * fp32_numel_each_device); @@ -1907,6 +2083,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp32_scale); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, @@ -1915,6 +2092,7 @@ void DistributedFusedLambKernel( global_comm, stream, dev_ctx, + comm_ctx, fp16_scale); } VLOG(1) << "FP32 HasNanInf after all reduce: " @@ -1929,6 +2107,8 @@ void DistributedFusedLambKernel( stream, &cub_tmp_buffer); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -1954,7 +2134,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, @@ -1962,7 +2143,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1970,7 +2152,8 @@ void DistributedFusedLambKernel( num_devices, local_comm, stream, - dev_ctx); + dev_ctx, + local_comm_ctx); NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, @@ -1978,7 +2161,8 @@ void DistributedFusedLambKernel( nranks / num_devices, external_comm, stream, - dev_ctx); + dev_ctx, + external_comm_ctx); } else { NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, @@ -1986,14 +2170,16 @@ void DistributedFusedLambKernel( nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); @@ -2004,14 +2190,16 @@ void DistributedFusedLambKernel( num_devices, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, num_devices, global_comm, stream, - dev_ctx); + dev_ctx, + comm_ctx); } CheckHasNanInfGrad(fp32_sum_grad, fp32_numel_each_device, @@ -2021,6 +2209,8 @@ void DistributedFusedLambKernel( stream, &cub_tmp_buffer); if (num_devices > 1) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but fp32_square_grad_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, @@ -2165,6 +2355,8 @@ void DistributedFusedLambKernel( << FlattenToString(trust_ratio_div_square_norm, param_num, place); if (num_devices > 1) { if (use_master_param_norm) { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but param_square_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, @@ -2174,6 +2366,8 @@ void DistributedFusedLambKernel( local_comm, stream)); } else { + // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, + // but trust_ratio_div_square_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(trust_ratio_div_square_norm, trust_ratio_div_square_norm, @@ -2209,13 +2403,21 @@ void DistributedFusedLambKernel( beta2); if (num_devices > 1) { // ncclAllGather - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, - fp32_param_data, - fp32_numel_each_device, - ncclFloat32, - local_comm, - stream)); + if (local_comm_ctx) { + auto send_buf = paddle::distributed::GetPartialTensor( + *fp32_param_out, fp32_offset, fp32_numel_each_device); + auto recv_buf = paddle::distributed::GetPartialTensor( + *fp32_param_out, 0, fp32_numel_each_device); + local_comm_ctx->AllGather(&recv_buf, send_buf, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, + fp32_param_data, + fp32_numel_each_device, + ncclFloat32, + local_comm, + stream)); + } } beta1_pow_data = nullptr; @@ -2239,13 +2441,21 @@ void DistributedFusedLambKernel( beta2); if (num_devices > 1) { // ncclAllGather - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, - fp16_param_data, - fp16_numel_each_device, - ncclFloat16, - local_comm, - stream)); + if (local_comm_ctx) { + auto send_buf = paddle::distributed::GetPartialTensor( + *fp16_param_out, fp16_offset, fp16_numel_each_device); + auto recv_buf = paddle::distributed::GetPartialTensor( + *fp16_param_out, 0, fp16_numel_each_device); + local_comm_ctx->AllGather(&recv_buf, send_buf, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, + fp16_param_data, + fp16_numel_each_device, + ncclFloat16, + local_comm, + stream)); + } } } VLOG(10) << "Update Param done"; diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 90b6a4c447c92..bd49f0cff1708 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -33,8 +33,11 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) : CommContext(rank, size) { PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&nccl_version_)); } +int NCCLCommContext::GetNcclVersion() { return nccl_version_; } + ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } @@ -228,5 +231,19 @@ void NCCLCommContext::GroupStart() { } void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } +#if NCCL_VERSION_CODE >= 21100 +void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t dtype, + ncclScalarResidence_t residence) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, scalar, dtype, residence, nccl_comm_)); +} + +void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); +} +#endif + } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index fdd45793a6387..b9fdce02f4b5f 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -40,7 +40,9 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); - ~NCCLCommContext() 
{} + ~NCCLCommContext() override = default; + + int GetNcclVersion(); ncclComm_t GetNcclComm(); @@ -65,6 +67,7 @@ class NCCLCommContext final : public CommContext { const phi::DenseTensor& in_tensor, int root, gpuStream_t stream); + void Send(const phi::DenseTensor& in_tensor, const int64_t& count, const int& peer, @@ -99,9 +102,24 @@ class NCCLCommContext final : public CommContext { void GroupEnd(); +#if NCCL_VERSION_CODE >= 21100 + // Creates a new reduction operator which pre-multiplies input values by a + // given scalar locally before reducing them with peer values via summation. + void RedOpCreatePreMulSum(ncclRedOp_t* op, + void* scalar, + ncclDataType_t dtype, + ncclScalarResidence_t residence); + + // Destroys the reduction operator op. The operator must have been created by + // ncclRedOpCreatePreMul with the matching communicator comm. + void RedOpDestroy(ncclRedOp_t op); +#endif + private: DISABLE_COPY_AND_ASSIGN(NCCLCommContext); + int nccl_version_; + ncclComm_t nccl_comm_; std::unique_ptr dev_ctx_; diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 9e7adef0a634f..e6a060c7369a9 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1013,11 +1013,11 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT - 120) + 240) set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge - PROPERTIES TIMEOUT 120) + PROPERTIES TIMEOUT 240) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) diff --git a/test/legacy_test/distributed_fused_lamb_test_base.py b/test/legacy_test/distributed_fused_lamb_test_base.py index baffc7dd5e546..ea011becc9090 100644 --- a/test/legacy_test/distributed_fused_lamb_test_base.py +++ b/test/legacy_test/distributed_fused_lamb_test_base.py @@ -270,7 +270,10 @@ def setUpClass(cls): paddle.enable_static() paddle.set_flags({'FLAGS_cudnn_deterministic': True}) _clip_by_global_norm_using_mp_type(True) - fleet.init(role_maker=get_role_maker()) + if os.environ.get("FLAGS_dynamic_static_unified_comm") == "1": + paddle.distributed.collective._init_parallel_env("nccl") + else: + fleet.init(role_maker=get_role_maker()) def config(self): clip_after_allreduce = bool( diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 671e11e7702fe..32ee6fd8b3958 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -41,6 +41,7 @@ def run_test( max_global_norm=-1.0, gradient_merge_steps=1, use_master_acc_grad=True, + need_env={}, ): temp_dir = tempfile.TemporaryDirectory() if not paddle.is_compiled_with_cuda(): @@ -54,6 +55,8 @@ def run_test( '-u', '-m', 'paddle.distributed.launch', + '--devices', + '0,1', '--log_dir', log_dir, get_test_file(), @@ -65,6 +68,7 @@ def run_test( os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0) + os.environ.update(need_env) touch_file_env = 
'SUCCESS_TOUCH_FILE' touch_file_name = os.path.join( @@ -87,6 +91,20 @@ def test_1(self): def test_2(self): run_test(clip_after_allreduce=False, max_global_norm=0.01) + def test_1_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=0.01, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + + def test_2_new_comm(self): + run_test( + clip_after_allreduce=False, + max_global_norm=0.01, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py index 0c7096f5dae1a..f236be3a8d150 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -33,6 +33,23 @@ def test_gm_with_fp16_acc_grad(self): use_master_acc_grad=False, ) + def test_gm_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + + def test_gm_with_fp16_acc_grad_new_comm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2, + use_master_acc_grad=False, + need_env={"FLAGS_dynamic_static_unified_comm": "1"}, + ) + if __name__ == "__main__": unittest.main() From 892dee35a525f1c752f2cbeff1a72df38b569155 Mon Sep 17 00:00:00 2001 From: iLeGend <824040212@qq.com> Date: Thu, 21 Sep 2023 10:56:43 +0800 Subject: [PATCH 15/39] [NewComm] No.2 compatiable upgrade for partial_recv op (#57548) * [NewComm] No.2 compatiable upgrade for partial_recv op * fix * add header * fix typo --- .../collective/partial_recv_op.cu.cc | 91 +++++++++++++++---- 1 file changed, 74 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 0c33ca7c25c32..2a6aea1c7a13a 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -18,15 +18,21 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#include "paddle/fluid/distributed/collective/utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" + namespace paddle { namespace operators { template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ NCCL_VERSION_CODE >= 2703 auto out = ctx.Output("Out"); @@ -74,35 +80,86 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { auto map = distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(rid)) { // Use ProcessGroup - distributed::ProcessGroup *pg = map->get(rid); + distributed::ProcessGroup* pg = map->get(rid); auto task = pg->Recv(out, peer, offset, recv_numel, /*sync_op*/ true); task->Wait(); } else { gpuStream_t stream = nullptr; - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + platform::NCCLComm* comm = nullptr; + phi::distributed::NCCLCommContext* comm_ctx = nullptr; + + int nranks = 0; + int rank = 0; + + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + // Use New Communication Library + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + stream = comm_ctx->GetStream(); + nranks = comm_ctx->GetSize(); + rank = comm_ctx->GetRank(); + + VLOG(3) << "new comm_context_manager has ring_id " << rid; + } else { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + stream = comm->stream(); + nranks = comm->nranks(); + rank = comm->rank(); + + VLOG(3) << "old NCCLCommContext has ring_id" << rid; + } + if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. 
stream = ctx.cuda_device_context().stream(); - } else { - stream = comm->stream(); } + PADDLE_ENFORCE_LT(peer, - comm->nranks(), + nranks, platform::errors::InvalidArgument( "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", + "be less than nranks (%d).", peer, - comm->nranks())); + nranks)); + ncclDataType_t dtype = platform::ToNCCLDataType(type); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclRecv(out->data() + offset, - recv_numel, - dtype, - peer, - comm->comm(), - stream)); - VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel - << " from offset[" << offset << "] from " << peer; + + if (comm_ctx) { + auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel); + + comm_ctx->Recv(&recv_buf, recv_numel, peer, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclRecv(out->data() + offset, + recv_numel, + dtype, + peer, + comm->comm(), + stream)); + } + VLOG(3) << "rank " << rank << " recv " << recv_numel << " from offset[" + << offset << "] from " << peer; } #else PADDLE_THROW(platform::errors::Unavailable( From 431a791a2c7626dcc669efba9bd77a880c625123 Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:56:53 +0800 Subject: [PATCH 16/39] Enhanced tuple support I (#57469) * bugs_fix:tuple_support * bug_fixes * bug_fixes * bug_fixes * bug_fixes * bug_fixes * bug_fixes --- python/paddle/nn/functional/common.py | 30 +++++---- python/paddle/nn/layer/common.py | 8 +-- python/paddle/vision/ops.py | 26 +++++--- test/legacy_test/test_box_coder_op.py | 92 +++++++++++++++++++++------ test/legacy_test/test_min_op.py | 9 +++ test/legacy_test/test_unfold_op.py | 11 ++++ 6 files changed, 133 insertions(+), 43 deletions(-) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5ef8e40d921b6..9b1da0dd36802 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -69,19 +69,19 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): Parameters: x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], data type can be float32 or float64 - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list, optional): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list, optional): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list, optional): the dilations of convolution kernel, should be + dilations(int|list|tuple, optional): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
@@ -116,38 +116,42 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): if isinstance(kernel_sizes, int): kernel_sizes = [kernel_sizes, kernel_sizes] else: - assert isinstance(kernel_sizes, list) and ( + assert isinstance(kernel_sizes, (list, tuple)) and ( len(kernel_sizes) == 2 - ), "kernel_sizes should either be an integer or a list of two integers" + ), "kernel_sizes should either be an integer or a list/tuple of two integers" + kernel_sizes = list(kernel_sizes) if isinstance(strides, int): strides = [strides, strides] else: - assert isinstance(strides, list) and ( + assert isinstance(strides, (list, tuple)) and ( len(strides) == 2 - ), "strides should either be an integer or a list of two integers" + ), "strides should either be an integer or a list/tuple of two integers" + strides = list(strides) if isinstance(dilations, int): dilations = [dilations, dilations] else: - assert isinstance(dilations, list) and ( + assert isinstance(dilations, (list, tuple)) and ( len(dilations) == 2 - ), "dilations should either be an integer or a list of two integers" + ), "dilations should either be an integer or a list/tuple of two integers" + dilations = list(dilations) if isinstance(paddings, int): paddings = [paddings] * 4 - elif isinstance(paddings, list): + elif isinstance(paddings, (list, tuple)): + paddings = list(paddings) if len(paddings) == 2: paddings = paddings * 2 elif len(paddings) == 4: pass else: raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" + "paddings should either be an integer or a list/tuple of 2 or 4 integers" ) else: raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list" + "Unexpected type of paddings, it should be either an integer or a list/tuple" "of 2 or 4 integers" ) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index db11591db5fe7..0c55895d21253 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1551,17 +1551,17 @@ class Unfold(Layer): Parameters: - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list, optional): The strides, should be [stride_h, stride_w] + strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list, optional): The paddings of each dimension, should be + paddings(int|list|tuple, optional): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0]. - dilations(int|list, optional): The dilations of convolution kernel, should be + dilations(int|list|tuple, optional): The dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. 
Normally there is no need for user to diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 677fd7602bcfa..d38f81a57ede9 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -616,10 +616,10 @@ def box_coder( left top coordinate of the anchor box, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var (List|Tensor|None): prior_box_var supports three types + prior_box_var (Tensor|List|tuple|None): prior_box_var supports four types of input. One is Tensor with shape [M, 4] which holds M group and - data type is float32 or float64. The second is list consist of - 4 elements shared by all boxes and data type is float32 or float64. + data type is float32 or float64. The second is list or tuple consist + of 4 elements shared by all boxes and data type is float32 or float64. Other is None and not involved in calculation. target_box (Tensor): This input can be a 2-D LoDTensor with shape [N, 4] when code_type is 'encode_center_size'. This input also can @@ -685,7 +685,11 @@ def box_coder( axis, [], ) - elif isinstance(prior_box_var, list): + elif isinstance(prior_box_var, (list, tuple)): + prior_box_var = list(prior_box_var) + assert ( + len(prior_box_var) == 4 + ), "Input prior_box_var must be Variable or list|tuple with 4 elements." output_box = _C_ops.box_coder( prior_box, None, @@ -696,9 +700,10 @@ def box_coder( prior_box_var, ) else: - raise TypeError("Input prior_box_var must be Variable or list") + raise TypeError( + "Input prior_box_var must be Variable or list|tuple" + ) return output_box - else: check_variable_and_dtype( prior_box, 'prior_box', ['float32', 'float64'], 'box_coder' @@ -720,10 +725,15 @@ def box_coder( } if isinstance(prior_box_var, Variable): inputs['PriorBoxVar'] = prior_box_var - elif isinstance(prior_box_var, list): + elif isinstance(prior_box_var, (list, tuple)): attrs['variance'] = prior_box_var + assert ( + len(attrs['variance']) == 4 + ), "Input prior_box_var must be Variable or list|tuple with 4 elements." 
else: - raise TypeError("Input prior_box_var must be Variable or list") + raise TypeError( + "Input prior_box_var must be Variable or list|tuple" + ) helper.append_op( type="box_coder", inputs=inputs, diff --git a/test/legacy_test/test_box_coder_op.py b/test/legacy_test/test_box_coder_op.py index 7221fb2ba73f6..72ef401aa5fb7 100644 --- a/test/legacy_test/test_box_coder_op.py +++ b/test/legacy_test/test_box_coder_op.py @@ -372,27 +372,30 @@ def setUp(self): def test_dygraph_with_static(self): paddle.enable_static() - prior_box = paddle.static.data( - name='prior_box', shape=[80, 4], dtype='float32' - ) - prior_box_var = paddle.static.data( - name='prior_box_var', shape=[80, 4], dtype='float32' - ) - target_box = paddle.static.data( - name='target_box', shape=[20, 80, 4], dtype='float32' - ) + exe = paddle.static.Executor() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + prior_box = paddle.static.data( + name='prior_box', shape=[80, 4], dtype='float32' + ) + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[80, 4], dtype='float32' + ) + target_box = paddle.static.data( + name='target_box', shape=[20, 80, 4], dtype='float32' + ) - boxes = paddle.vision.ops.box_coder( - prior_box=prior_box, - prior_box_var=prior_box_var, - target_box=target_box, - code_type="decode_center_size", - box_normalized=False, - ) + boxes = paddle.vision.ops.box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + ) - exe = paddle.static.Executor() boxes_np = exe.run( - paddle.static.default_main_program(), + main, feed={ 'prior_box': self.prior_box_np, 'prior_box_var': self.prior_box_var_np, @@ -419,6 +422,59 @@ def test_dygraph_with_static(self): paddle.enable_static() +class TestBoxCoderSupporttuple(unittest.TestCase): + def setUp(self): + np.random.seed(678) + self.prior_box_np = np.random.random((80, 4)).astype('float32') + self.target_box_np = np.random.random((20, 80, 4)).astype('float32') + + def test_support_tuple(self): + paddle.enable_static() + exe = paddle.static.Executor() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + prior_box = paddle.static.data( + name='prior_box', shape=[80, 4], dtype='float32' + ) + target_box = paddle.static.data( + name='target_box', shape=[20, 80, 4], dtype='float32' + ) + + boxes = paddle.vision.ops.box_coder( + prior_box=prior_box, + prior_box_var=(1, 2, 3, 4), + target_box=target_box, + code_type="decode_center_size", + box_normalized=False, + ) + + boxes_np = exe.run( + main, + feed={ + 'prior_box': self.prior_box_np, + 'target_box': self.target_box_np, + }, + fetch_list=[boxes], + )[0] + + paddle.disable_static() + prior_box_dy = paddle.to_tensor(self.prior_box_np) + target_box_dy = paddle.to_tensor(self.target_box_np) + + boxes_dy = paddle.vision.ops.box_coder( + prior_box=prior_box_dy, + prior_box_var=(1, 2, 3, 4), + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False, + ) + boxes_dy_np = boxes_dy.numpy() + + np.testing.assert_allclose(boxes_np, boxes_dy_np) + paddle.enable_static() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py index 7de7108d7d1ad..e24471b20dca8 100644 --- a/test/legacy_test/test_min_op.py +++ b/test/legacy_test/test_min_op.py @@ -83,6 +83,15 @@ def 
test_imperative_api(self): z_expected = np.array(np.min(np_x, axis=0)) self.assertEqual((np_z == z_expected).all(), True) + def test_support_tuple(self): + paddle.disable_static() + np_x = np.array([10, 10]).astype('float64') + x = paddle.to_tensor(np_x) + z = paddle.min(x, axis=(0,)) + np_z = z.numpy() + z_expected = np.array(np.min(np_x, axis=0)) + self.assertEqual((np_z == z_expected).all(), True) + class TestOutDtype(unittest.TestCase): def test_min(self): diff --git a/test/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py index 8a7f2aaf199f3..ef8174256e5cb 100644 --- a/test/legacy_test/test_unfold_op.py +++ b/test/legacy_test/test_unfold_op.py @@ -144,6 +144,17 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['X'], 'Y') + def test_support_tuple(self): + paddle.disable_static() + x = paddle.randn((10, 3, 64, 64)) + paddle.nn.functional.unfold(x, 3, (1, 1), 1, 1) + paddle.nn.functional.unfold(x, 3, 1, (1, 1), 1) + paddle.nn.functional.unfold(x, 3, 1, 1, (1, 1)) + out1 = paddle.nn.functional.unfold(x, 3, (1, 1), (1, 1), (1, 1)) + out2 = paddle.nn.functional.unfold(x, (3, 3), (1, 1), (1, 1), (1, 1)) + self.assertTrue(np.allclose(out1.numpy(), out2.numpy())) + paddle.enable_static() + class TestUnfoldFP16Op(TestUnfoldOp): def init_dtype(self): From 20893b0b10df7602c597fcfc920eaec015701860 Mon Sep 17 00:00:00 2001 From: Ligoml <39876205+Ligoml@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:57:05 +0800 Subject: [PATCH 17/39] Update CI api_docs_approval (#57542) * Don't Merge * make conflict * reset * updata check_api_approvals.sh --- tools/check_api_approvals.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 3989a0cceff1b..5f05b3cf6f080 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -43,22 +43,18 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" - echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" - echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general APIs.\n" check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01 - check_approval 1 jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1 + check_approval 1 jzhang533 sunzhongkai588 Ligoml fi api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` if [ "$api_doc_spec_diff" != "" ]; then echo_line="You must have one TPM approval for API documents change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general API docs.\n" - echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n" - echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general API docs.\n" - check_approval 1 
jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1 + check_approval 1 jzhang533 sunzhongkai588 Ligoml fi api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}` From 69fa09a223fbcbd668099d425655f141dc5c1883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 10:59:19 +0800 Subject: [PATCH 18/39] add API for ir_compare and move it from namespace optim to ir_utils (#57531) --- .../auto_schedule/search_space/search_state.cc | 7 +++---- .../auto_schedule/search_space/search_state.h | 4 ++-- paddle/cinn/ir/test/ir_compare_test.cc | 18 +++++++----------- paddle/cinn/ir/utils/ir_compare.cc | 8 ++++++++ paddle/cinn/ir/utils/ir_compare.h | 6 ++++++ paddle/cinn/ir/utils/ir_visitor.cc | 3 +-- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/paddle/cinn/auto_schedule/search_space/search_state.cc b/paddle/cinn/auto_schedule/search_space/search_state.cc index 96ace0f505d7f..c16bf62840291 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.cc +++ b/paddle/cinn/auto_schedule/search_space/search_state.cc @@ -133,11 +133,10 @@ bool SearchStateEqual::operator()(const SearchState& lhs, // compare exprs size firstly if (lhs_exprs.size() != rhs_exprs.size()) return false; - // compare every expr one by one with ir::IrEqualVisitor + // compare every expr one by one with ir::ir_utils::IrEqualVisitor for (int i = 0; i < lhs_exprs.size(); ++i) { - ir::IrEqualVisitor compartor( - /*allow_name_suffix_diff=*/true); // ignore suffix difference in name - if (!compartor.Compare(lhs_exprs[i], rhs_exprs[i])) return false; + if (!ir::ir_utils::IRCompare(lhs_exprs[i], rhs_exprs[i], true)) + return false; } return true; } diff --git a/paddle/cinn/auto_schedule/search_space/search_state.h b/paddle/cinn/auto_schedule/search_space/search_state.h index 7991fb9540188..b3f45c5cd746c 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.h +++ b/paddle/cinn/auto_schedule/search_space/search_state.h @@ -70,8 +70,8 @@ struct SearchStateHash { size_t operator()(const SearchState& s) const; }; -// SearchStateHash equal functor, use ir::IrEqualVisitor to compare their AST -// struct and fields +// SearchStateHash equal functor, use ir::ir_utils::IrEqualVisitor to compare +// their AST struct and fields struct SearchStateEqual { bool operator()(const SearchState& lhs, const SearchState& rhs) const; }; diff --git a/paddle/cinn/ir/test/ir_compare_test.cc b/paddle/cinn/ir/test/ir_compare_test.cc index a1bca0cd5373f..cc9ce438221a2 100644 --- a/paddle/cinn/ir/test/ir_compare_test.cc +++ b/paddle/cinn/ir/test/ir_compare_test.cc @@ -23,7 +23,7 @@ namespace cinn { namespace ir { - +namespace ir_utils { TEST(TestIrCompare, SingleFunction) { Target target = common::DefaultHostTarget(); @@ -128,20 +128,16 @@ TEST(TestIrCompare, SingleFunction) { ASSERT_EQ(func2_str, utils::GetStreamCnt(funcs_2.front())); ASSERT_EQ(func3_str, utils::GetStreamCnt(funcs_3.front())); - IrEqualVisitor compartor; // they are different at the name of root ScheduleBlock - ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_2.front())); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front())); // compare with itself - ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_1.front())); - IrEqualVisitor compartor_allow_suffix_diff(true); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_1.front())); // they are euqal if allowing suffix of name different - ASSERT_TRUE( - 
compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_2.front())); + ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front(), true)); - ASSERT_FALSE(compartor.Compare(funcs_1.front(), funcs_3.front())); - ASSERT_FALSE( - compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_3.front())); + ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front())); + ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front(), true)); } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_compare.cc b/paddle/cinn/ir/utils/ir_compare.cc index c303262d04fbd..87324be608048 100644 --- a/paddle/cinn/ir/utils/ir_compare.cc +++ b/paddle/cinn/ir/utils/ir_compare.cc @@ -22,6 +22,8 @@ namespace cinn { namespace ir { +namespace ir_utils { + bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) { if (lhs.get() == rhs.get()) { // the same object, including both are null return true; @@ -358,5 +360,11 @@ bool IrEqualVisitor::Visit(const ScheduleBlockRealize* lhs, const Expr* other) { Compare(lhs->schedule_block, rhs->schedule_block); } +bool IRCompare(const Expr& lhs, const Expr& rhs, bool allow_name_suffix_diff) { + IrEqualVisitor ir_equal_visitor(allow_name_suffix_diff); + return ir_equal_visitor.Compare(lhs, rhs); +} + +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_compare.h b/paddle/cinn/ir/utils/ir_compare.h index 9e4b335857b98..d41e6db0441a7 100644 --- a/paddle/cinn/ir/utils/ir_compare.h +++ b/paddle/cinn/ir/utils/ir_compare.h @@ -20,6 +20,7 @@ namespace cinn { namespace ir { +namespace ir_utils { // Determine whether two ir AST trees are euqal by comparing their struct and // fields of each node through dfs visitor @@ -47,5 +48,10 @@ class IrEqualVisitor : public IRVisitorRequireReImpl { bool allow_name_suffix_diff_ = false; }; +bool IRCompare(const Expr& lhs, + const Expr& rhs, + bool allow_name_suffix_diff = false); + +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_visitor.cc b/paddle/cinn/ir/utils/ir_visitor.cc index 9ef6a78df1fcd..f55259be2c641 100644 --- a/paddle/cinn/ir/utils/ir_visitor.cc +++ b/paddle/cinn/ir/utils/ir_visitor.cc @@ -23,8 +23,7 @@ namespace ir { bool operator==(Expr a, Expr b) { if (a.get() == b.get()) return true; - IrEqualVisitor cmp; - return cmp.Compare(a, b); + return ir_utils::IRCompare(a, b); } bool operator!=(Expr a, Expr b) { return !(a == b); } From b718b1be52e67f72974de7db42fc0fecf070ac18 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:00:31 +0800 Subject: [PATCH 19/39] pir support pixel unshuffle op (#57521) --- paddle/fluid/operators/pixel_unshuffle_op.cc | 105 ------------------- paddle/phi/api/yaml/backward.yaml | 9 ++ paddle/phi/api/yaml/op_compat.yaml | 7 ++ paddle/phi/api/yaml/ops.yaml | 9 ++ paddle/phi/ops/compat/pixel_unshuffle_sig.cc | 30 ------ test/legacy_test/test_pixel_unshuffle.py | 4 +- 6 files changed, 27 insertions(+), 137 deletions(-) delete mode 100644 paddle/fluid/operators/pixel_unshuffle_op.cc delete mode 100644 paddle/phi/ops/compat/pixel_unshuffle_sig.cc diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc deleted file mode 100644 index 52b7452d7a8cc..0000000000000 --- a/paddle/fluid/operators/pixel_unshuffle_op.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class PixelUnshuffleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor), "
-             "the input feature data of PixelUnshuffleOp, the layout is "
-             "[N, C, H, W] or [N, H, W, C].");
-    AddOutput("Out",
-              "(Tensor, default Tensor), the output of "
-              "PixelUnshuffleOp. The layout is [N, C*factor^2, H/factor, "
-              "W/factor] or [N, H/factor, W/factor, C*factor^2].");
-    AddAttr("downscale_factor",
-            "the factor to decrease spatial resolution by.")
-        .SetDefault(1);
-    AddAttr(
-        "data_format",
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\", Specify the data format of the input data.")
-        .SetDefault("NCHW");
-
-    AddComment(R"DOC(
-    Pixel Unshuffle operator
-    This operator rearranges elements in a tensor of shape :math:`(*, C, H, W)`
-    to a tensor of shape :math:`(*, C\times r^2, H / r, W / r)`.
-
-    This operation is the reversion of PixelShuffle operation.
-
-    Please refer to the paper:
-    `Real-Time Single Image and Video Super-Resolution Using an Efficient
-    Sub-Pixel Convolutional Neural Network `_
-    by Shi et. al (2016) for more details.
- )DOC"); - } -}; - -template -class PixelUnshuffleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("pixel_unshuffle_grad"); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -class PixelUnshuffleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle, - PixelUnshuffleInferShapeFunctor, - PD_INFER_META(phi::PixelUnshuffleInferMeta)); - -REGISTER_OPERATOR(pixel_unshuffle, - ops::PixelUnshuffleOp, - ops::PixelUnshuffleOpMaker, - ops::PixelUnshuffleGradOpMaker, - ops::PixelUnshuffleGradOpMaker, - PixelUnshuffleInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle_grad, - PixelUnshuffleGradInferShapeFunctor, - PD_INFER_META(phi::PixelUnshuffleGradInferMeta)); - -REGISTER_OPERATOR(pixel_unshuffle_grad, - ops::PixelUnshuffleGradOp, - PixelUnshuffleGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index b6eeb5e07005c..2f48bb80478e6 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1646,6 +1646,15 @@ kernel : func : pixel_shuffle_grad +- backward_op : pixel_unshuffle_grad + forward : pixel_unshuffle (Tensor x, int downscale_factor=1, str data_format="NCHW") -> Tensor(out) + args : (Tensor out_grad, int downscale_factor, str data_format) + output : Tensor(x_grad) + infer_meta : + func : PixelUnshuffleGradInferMeta + kernel : + func : pixel_unshuffle_grad + - backward_op : poisson_grad forward : poisson (Tensor x) -> Tensor(out) args : (Tensor out_grad) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 31125b8df0ce7..8a85147a66da0 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2103,6 +2103,13 @@ outputs : out : Out +- op : pixel_unshuffle + backward : pixel_unshuffle_grad + inputs : + x : X + outputs : + out : Out + - op : poisson inputs : x : X diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4e67144ba8a89..c93f94c2b3320 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1934,6 +1934,15 @@ func : pixel_shuffle backward : pixel_shuffle_grad +- op : pixel_unshuffle + args : (Tensor x, int downscale_factor=1, str data_format="NCHW") + output : Tensor + infer_meta : + func : PixelUnshuffleInferMeta + kernel : + func : pixel_unshuffle + backward : pixel_unshuffle_grad + - op : poisson args : (Tensor x) output : Tensor diff --git a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc b/paddle/phi/ops/compat/pixel_unshuffle_sig.cc deleted file mode 100644 index 6c983c1e24c28..0000000000000 --- a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
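A quick usage sketch of the signature registered in ops.yaml above, pixel_unshuffle(Tensor x, int downscale_factor=1, str data_format="NCHW"). The input shape below is an arbitrary assumption; only the [N, C*r^2, H/r, W/r] output layout comes from the removed OpMaker doc, and the functional API is the one the updated test wrapper below calls:

    import paddle

    x = paddle.rand([2, 4, 12, 12])  # NCHW; shape chosen only for illustration
    out = paddle.nn.functional.pixel_unshuffle(x, downscale_factor=2, data_format="NCHW")
    print(out.shape)  # [2, 16, 6, 6]: C * r^2, H / r, W / r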
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature PixelUnshuffleGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("pixel_unshuffle_grad", - {"Out@GRAD"}, - {"downscale_factor", "data_format"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad, - phi::PixelUnshuffleGradOpArgumentMapping); diff --git a/test/legacy_test/test_pixel_unshuffle.py b/test/legacy_test/test_pixel_unshuffle.py index ec6ce803d1277..eb2c287b3f886 100644 --- a/test/legacy_test/test_pixel_unshuffle.py +++ b/test/legacy_test/test_pixel_unshuffle.py @@ -69,8 +69,8 @@ def pixel_unshuffle_np(x, down_factor, data_format="NCHW"): def pixel_unshuffle_wrapper(x, downscale_factor, data_format): - return paddle._legacy_C_ops.pixel_unshuffle( - x, "downscale_factor", downscale_factor, "data_format", data_format + return paddle.nn.functional.pixel_unshuffle( + x, downscale_factor, data_format ) From c4dd10935231f0cf4253225e912a195435dd2d2b Mon Sep 17 00:00:00 2001 From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:07:18 +0800 Subject: [PATCH 20/39] [CodeStyle][task 39] enable isort in `python/paddle/base` (part1) (#57413) * enable isort rule in python/paddle/base * fix bug * fix bug * fix bug --- pyproject.toml | 3 +- python/paddle/base/backward.py | 19 ++++---- python/paddle/base/compiler.py | 19 ++++---- python/paddle/base/data_feed_desc.py | 3 +- python/paddle/base/data_feeder.py | 13 +++--- python/paddle/base/dataset.py | 6 ++- python/paddle/base/default_scope_funcs.py | 3 +- python/paddle/base/dygraph/base.py | 16 ++++--- python/paddle/base/dygraph/math_op_patch.py | 10 ++--- .../base/dygraph/tensor_patch_methods.py | 34 +++++++------- python/paddle/base/dygraph/tracer.py | 3 +- python/paddle/base/dygraph_utils.py | 3 +- python/paddle/base/executor.py | 33 ++++++-------- .../incubate/checkpoint/auto_checkpoint.py | 11 ++--- python/paddle/base/initializer.py | 3 +- python/paddle/base/io.py | 1 + python/paddle/base/layer_helper.py | 11 ++--- python/paddle/base/layer_helper_base.py | 9 ++-- python/paddle/base/layers/io.py | 6 +-- .../base/layers/layer_function_generator.py | 13 +++--- python/paddle/base/layers/math_op_patch.py | 11 +++-- python/paddle/base/lod_tensor.py | 3 +- python/paddle/base/multiprocess_utils.py | 7 ++- python/paddle/base/param_attr.py | 2 +- python/paddle/base/reader.py | 44 +++++++++---------- python/paddle/base/trainer_desc.py | 2 +- python/paddle/base/trainer_factory.py | 26 ++++++----- python/paddle/base/unique_name.py | 3 +- python/paddle/base/variable_index.py | 24 +++++----- python/paddle/base/wrapped_decorator.py | 3 +- 30 files changed, 172 insertions(+), 172 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e11ab2108c2be..8dd98b65873aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,9 @@ skip = ["build", "third_party", "__init__.py"] extend_skip_glob = [ # These files do not need to be formatted, # see .flake8 for more details - "python/paddle/base/**", "python/paddle/utils/gast/**", + 
"python/paddle/base/core.py", + "python/paddle/base/framework.py", ] [tool.ruff] diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 563e423e0c7ea..1f3f67a98b640 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -12,23 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .proto import framework_pb2 - -from paddle.base import framework as framework -from paddle.base import program_guard -from . import core import collections import copy import logging -from . import unique_name -from . import log_helper -import paddle.base -from .data_feeder import check_type +import re import warnings - from collections.abc import Sequence -import re +import paddle.base +from paddle.base import framework as framework +from paddle.base import program_guard + +from . import core, log_helper, unique_name +from .data_feeder import check_type +from .proto import framework_pb2 __all__ = [ 'append_backward', diff --git a/python/paddle/base/compiler.py b/python/paddle/base/compiler.py index 69ae6f1d31344..3ee939920dc2b 100644 --- a/python/paddle/base/compiler.py +++ b/python/paddle/base/compiler.py @@ -14,9 +14,9 @@ import sys import warnings -from . import framework -from .framework import cuda_places, cpu_places, xpu_places -from . import core + +from . import core, framework +from .framework import cpu_places, cuda_places, xpu_places __all__ = [ 'CompiledProgram', @@ -399,10 +399,11 @@ def convert_concrete_program( """ Convert the ConcreteProgram to IPUConcreteProgram. """ - from ..base.dygraph.base import switch_to_static_graph + import paddle + from ..base import backward + from ..base.dygraph.base import switch_to_static_graph from ..base.framework import device_guard - import paddle inputs = concrete_program.inputs outputs = concrete_program.outputs @@ -508,14 +509,12 @@ def patch_program_cache(ipu_strategy): Returns: None """ + from paddle.jit.dy2static import logging_utils + from paddle.jit.dy2static.partial_program import partial_program_from from paddle.jit.dy2static.program_translator import ( + MAX_TRACED_PROGRAM_COUNT, CacheKey, ProgramCache, - MAX_TRACED_PROGRAM_COUNT, - ) - from paddle.jit.dy2static import logging_utils - from paddle.jit.dy2static.partial_program import ( - partial_program_from, ) old_getter = ProgramCache.__getitem__ diff --git a/python/paddle/base/data_feed_desc.py b/python/paddle/base/data_feed_desc.py index 8aa69890f1933..de1b00d090bb1 100644 --- a/python/paddle/base/data_feed_desc.py +++ b/python/paddle/base/data_feed_desc.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.base.proto import data_feed_pb2 from google.protobuf import text_format +from paddle.base.proto import data_feed_pb2 + __all__ = ['DataFeedDesc'] diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py index 78781a6856af1..52ed983ffa729 100644 --- a/python/paddle/base/data_feeder.py +++ b/python/paddle/base/data_feeder.py @@ -12,20 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import core -import numpy as np -import warnings import struct +import warnings + +import numpy as np +from ..ir import OpResult +from . 
import core from .framework import ( Variable, + _cpu_num, + _cuda_ids, default_main_program, in_dygraph_mode, in_pir_mode, ) -from .framework import _cpu_num, _cuda_ids - -from ..ir import OpResult __all__ = ['DataFeeder'] diff --git a/python/paddle/base/dataset.py b/python/paddle/base/dataset.py index 533fb69a6621b..961a392349707 100644 --- a/python/paddle/base/dataset.py +++ b/python/paddle/base/dataset.py @@ -13,10 +13,12 @@ # limitations under the License. """This is definition of dataset class, which is high performance IO.""" -from paddle.base.proto import data_feed_pb2 from google.protobuf import text_format -from . import core + +from paddle.base.proto import data_feed_pb2 + from ..utils import deprecated +from . import core __all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset'] diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py index 80cfe40db57ad..992714e6cd409 100644 --- a/python/paddle/base/default_scope_funcs.py +++ b/python/paddle/base/default_scope_funcs.py @@ -26,9 +26,10 @@ invoked in a new local scope. """ -import paddle.base.core import threading +import paddle.base.core + __tl_scope__ = threading.local() __all__ = [ diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 7edb748026d84..d85fc8ca25bf7 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -11,20 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator -import decorator import inspect import sys +import warnings + +import decorator import numpy as np -from paddle.base import core -from paddle.base import framework + +import paddle +from paddle.base import core, framework from paddle.base.framework import global_var from paddle.base.multiprocess_utils import CleanupFuncRegistrar -from .tracer import Tracer + from ..data_feeder import convert_dtype -import warnings from ..framework import _get_paddle_place -import paddle +from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator +from .tracer import Tracer __all__ = [ 'no_grad', diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index 9448d7d9de9dd..5972b545f93e2 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import core -from ..framework import ( - convert_np_dtype_to_dtype_, -) -from .. import framework - import numpy as np + from paddle import _C_ops, _legacy_C_ops +from .. import core, framework +from ..framework import convert_np_dtype_to_dtype_ + _supported_int_dtype_ = [ core.VarDesc.VarType.UINT8, core.VarDesc.VarType.INT8, diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index 8026884c34fc8..4f1b138abaae4 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -13,33 +13,33 @@ # limitations under the License. import inspect -import numpy as np -import warnings import sys +import warnings + +import numpy as np import paddle -from .. import framework -from ..framework import convert_np_dtype_to_dtype_ -from .. 
import core -from .. import unique_name +import paddle.profiler as profiler +import paddle.utils.deprecated as deprecated +from paddle import _C_ops +from paddle.base.data_feeder import ( + _PADDLE_DTYPE_2_NUMPY_DTYPE, + convert_uint16_to_float, +) +from paddle.profiler.utils import in_profiler_mode + +from .. import core, framework, unique_name from ..framework import ( - Variable, + EagerParamBase, Parameter, + Variable, _getitem_static, - _setitem_static, _setitem_impl_, - EagerParamBase, + _setitem_static, + convert_np_dtype_to_dtype_, ) from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_tensor -from paddle.base.data_feeder import ( - convert_uint16_to_float, - _PADDLE_DTYPE_2_NUMPY_DTYPE, -) -import paddle.utils.deprecated as deprecated -import paddle.profiler as profiler -from paddle.profiler.utils import in_profiler_mode -from paddle import _C_ops _grad_scalar = None diff --git a/python/paddle/base/dygraph/tracer.py b/python/paddle/base/dygraph/tracer.py index 35cbe88f91f87..4df9517073c66 100644 --- a/python/paddle/base/dygraph/tracer.py +++ b/python/paddle/base/dygraph/tracer.py @@ -13,9 +13,8 @@ # limitations under the License. -from paddle.base import core -from paddle.base import framework from paddle import _C_ops, _legacy_C_ops +from paddle.base import core, framework name_mapping = { "graph_send_recv": { diff --git a/python/paddle/base/dygraph_utils.py b/python/paddle/base/dygraph_utils.py index 655a5f4f8b773..926c4680017ce 100644 --- a/python/paddle/base/dygraph_utils.py +++ b/python/paddle/base/dygraph_utils.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .framework import dygraph_only from paddle import _legacy_C_ops +from .framework import dygraph_only + @dygraph_only def _append_activation_in_dygraph( diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index e5fddd15329e3..0921d7b79d14b 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -12,36 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os import sys import warnings -import numpy as np +from functools import lru_cache -from . import set_flags, get_flags -from .framework import Program, default_main_program +import numpy as np from ..ir import OpResult -from .wrapped_decorator import signature_safe_contextmanager +from . import compiler, core, framework, get_flags, set_flags, unique_name from .data_feeder import convert_dtype -from .framework import Variable, Operator, in_pir_mode - from .framework import ( - convert_np_dtype_to_dtype_, + Operator, + Program, + Variable, _apply_pass, + convert_np_dtype_to_dtype_, + default_main_program, + in_pir_mode, paddle_type_to_proto_type, ) - -from . import core -from . import unique_name -from . import compiler -from .trainer_factory import TrainerFactory -from .trainer_factory import FetchHandlerMonitor -import copy -from . 
import framework from .incubate.checkpoint import auto_checkpoint as acp - -from functools import lru_cache +from .trainer_factory import FetchHandlerMonitor, TrainerFactory +from .wrapped_decorator import signature_safe_contextmanager __all__ = ['Executor', 'global_scope', 'scope_guard'] @@ -614,8 +609,8 @@ def _to_str(var): def _prepare_fleet_executor(): - from ..distributed.fleet.proto import fleet_executor_desc_pb2 from ..distributed.backup_env import getenv_or_backup + from ..distributed.fleet.proto import fleet_executor_desc_pb2 trainer_endpoints_str = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS", "") trainer_endpoints = trainer_endpoints_str.split(',') @@ -945,7 +940,7 @@ def _get_program_and_executor(self, cached_data): # print(f"Program after convert:\n {inner_program}", flush=True) else: build_strategy = None - from paddle.incubate.autograd import prim_enabled, prim2orig + from paddle.incubate.autograd import prim2orig, prim_enabled if prim_enabled() and program == default_main_program(): prim2orig() diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py index 23239b692c975..e8f75f3a4ed55 100644 --- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import logging import json +import logging import os +import sys import time from threading import current_thread -from paddle.base import unique_name, compiler -from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel -from paddle.base.framework import in_dygraph_mode, Program +from paddle.base import compiler, unique_name +from paddle.base.framework import Program, in_dygraph_mode + +from .checkpoint_saver import CheckpointSaver, PaddleModel, SerializableBase g_train_epoch_range = None g_checker = None diff --git a/python/paddle/base/initializer.py b/python/paddle/base/initializer.py index 3902281721eac..7443e63b13e52 100644 --- a/python/paddle/base/initializer.py +++ b/python/paddle/base/initializer.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .data_feeder import check_type import paddle +from .data_feeder import check_type + __all__ = ['set_global_initializer'] _global_weight_initializer_ = None diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py index 89468f88648e8..a2c7d02ede349 100644 --- a/python/paddle/base/io.py +++ b/python/paddle/base/io.py @@ -15,6 +15,7 @@ import logging from paddle.base.log_helper import get_logger + from . import reader from .reader import * diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py index e6be93e777b75..312eaf67a3320 100644 --- a/python/paddle/base/layer_helper.py +++ b/python/paddle/base/layer_helper.py @@ -13,18 +13,19 @@ # limitations under the License. import copy + import paddle + +from . import unique_name +from .dygraph_utils import _append_activation_in_dygraph from .framework import ( Parameter, + _global_flags, dtype_is_floating, in_dygraph_mode, - _global_flags, ) -from . 
import unique_name -from .param_attr import ParamAttr - from .layer_helper_base import LayerHelperBase -from .dygraph_utils import _append_activation_in_dygraph +from .param_attr import ParamAttr class LayerHelper(LayerHelperBase): diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py index b7bc6c6b8585e..6c047c08766fe 100644 --- a/python/paddle/base/layer_helper_base.py +++ b/python/paddle/base/layer_helper_base.py @@ -13,21 +13,22 @@ # limitations under the License. import copy + import numpy as np + import paddle +from . import core, unique_name from .framework import ( Variable, + _current_expected_place, default_main_program, default_startup_program, in_dygraph_mode, in_pir_mode, - _current_expected_place, ) -from . import unique_name +from .initializer import _global_bias_initializer, _global_weight_initializer from .param_attr import ParamAttr, WeightNormParamAttr -from . import core -from .initializer import _global_weight_initializer, _global_bias_initializer __all__ = ['LayerHelperBase'] diff --git a/python/paddle/base/layers/io.py b/python/paddle/base/layers/io.py index d4aa7734aee6f..51f5b10fe0618 100644 --- a/python/paddle/base/layers/io.py +++ b/python/paddle/base/layers/io.py @@ -14,13 +14,9 @@ from .. import core from ..executor import global_scope -from ..framework import ( - default_main_program, - default_startup_program, -) +from ..framework import default_main_program, default_startup_program from ..unique_name import generate as unique_name - __all__ = [] diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py index 1b1b85d00ea42..bd11a412ffc5b 100644 --- a/python/paddle/base/layers/layer_function_generator.py +++ b/python/paddle/base/layers/layer_function_generator.py @@ -13,21 +13,22 @@ # limitations under the License. import re -import warnings import string - +import warnings from io import StringIO -from ..proto import framework_pb2 + +from paddle import _C_ops, _legacy_C_ops + +from ..data_feeder import check_variable_and_dtype from ..framework import ( OpProtoHolder, Variable, - core, convert_np_dtype_to_dtype_, + core, in_dygraph_mode, ) from ..layer_helper import LayerHelper -from ..data_feeder import check_variable_and_dtype -from paddle import _C_ops, _legacy_C_ops +from ..proto import framework_pb2 __all__ = [ 'generate_layer_fn', diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index 06f384eae23d1..53f35939b1f3a 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings import inspect +import warnings + +from paddle.base.dygraph.base import in_to_static_mode from .. 
import core -from ..framework import Variable, unique_name, static_only +from ..framework import Variable, static_only, unique_name from .layer_function_generator import OpProtoHolder -from paddle.base.dygraph.base import in_to_static_mode _supported_int_dtype_ = [ core.VarDesc.VarType.BOOL, @@ -354,9 +355,7 @@ def pop(self, *args): Returns: Variable: self[index] """ - from paddle.jit.dy2static.convert_operators import ( - _run_paddle_pop, - ) + from paddle.jit.dy2static.convert_operators import _run_paddle_pop if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: raise TypeError( diff --git a/python/paddle/base/lod_tensor.py b/python/paddle/base/lod_tensor.py index 96e18ec8f3bde..4be41d5cc6adc 100644 --- a/python/paddle/base/lod_tensor.py +++ b/python/paddle/base/lod_tensor.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np + from . import core from .data_feeder import DataToLoDTensorConverter -import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] diff --git a/python/paddle/base/multiprocess_utils.py b/python/paddle/base/multiprocess_utils.py index b763446930fdb..8d18db0bb3ea8 100644 --- a/python/paddle/base/multiprocess_utils.py +++ b/python/paddle/base/multiprocess_utils.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import signal import atexit +import queue +import signal +import sys from . import core -import queue - # multi-process worker check indices queue interval, avoid # hanging in subprocess data loading MP_STATUS_CHECK_INTERVAL = 5.0 diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py index a17432fcc3df2..674c4ad4328c5 100644 --- a/python/paddle/base/param_attr.py +++ b/python/paddle/base/param_attr.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -from paddle.regularizer import WeightDecayRegularizer from paddle.base.data_feeder import check_type +from paddle.regularizer import WeightDecayRegularizer __all__ = [ 'ParamAttr', diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py index 63b97ee2bd495..c3a65721db275 100644 --- a/python/paddle/base/reader.py +++ b/python/paddle/base/reader.py @@ -12,44 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import core +import logging +import multiprocessing +import queue import sys -import numpy as np import threading +import warnings + +import numpy as np + import paddle +from paddle.base.framework import _set_expected_place +from . 
import core +from .data_feeder import BatchedTensorProvider, DataFeeder +from .executor import global_scope from .framework import ( Program, - program_guard, + _current_expected_place, + _get_paddle_place, + _get_paddle_place_list, default_main_program, default_startup_program, in_dygraph_mode, - _current_expected_place, + program_guard, ) -from .executor import global_scope -from .data_feeder import DataFeeder, BatchedTensorProvider +from .layers.io import ( + __create_unshared_decorated_reader__, + _copy_reader_var_, + monkey_patch_reader_methods, +) +from .multiprocess_utils import _cleanup # noqa: F401 +from .multiprocess_utils import multiprocess_queue_set # noqa: F401 from .multiprocess_utils import ( - multiprocess_queue_set, # noqa: F401 CleanupFuncRegistrar, _cleanup_mmap, - _cleanup, # noqa: F401 _set_SIGCHLD_handler, ) -from .layers.io import ( - monkey_patch_reader_methods, - _copy_reader_var_, - __create_unshared_decorated_reader__, -) from .unique_name import UniqueNameGenerator -from .framework import _get_paddle_place, _get_paddle_place_list -from paddle.base.framework import _set_expected_place -import logging -import warnings - -### Dygraph DataLoader configs ### -import multiprocessing - -import queue # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process QUEUE_GET_TIMEOUT = 60 diff --git a/python/paddle/base/trainer_desc.py b/python/paddle/base/trainer_desc.py index 48cc427ac8e7e..f64530ec02353 100644 --- a/python/paddle/base/trainer_desc.py +++ b/python/paddle/base/trainer_desc.py @@ -13,8 +13,8 @@ # limitations under the License. """Definition of trainers.""" -import sys import os +import sys __all__ = [ 'TrainerDesc', diff --git a/python/paddle/base/trainer_factory.py b/python/paddle/base/trainer_factory.py index cf197fab524e0..75351872d73d6 100644 --- a/python/paddle/base/trainer_factory.py +++ b/python/paddle/base/trainer_factory.py @@ -13,33 +13,35 @@ # limitations under the License. """Definition of TrainerFactory.""" +import logging import threading import time -import logging + import numpy as np + from paddle.base.log_helper import get_logger local_logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) -from .trainer_desc import ( # noqa: F401 - MultiTrainer, - DistMultiTrainer, - PipelineTrainer, - HeterXpuTrainer, - PSGPUTrainer, - HeterPipelineTrainer, -) from .device_worker import ( # noqa: F401 - Hogwild, - DownpourSGD, DownpourLite, - Section, + DownpourSGD, DownpourSGDOPT, HeterSection, + Hogwild, + Section, ) from .framework import Variable +from .trainer_desc import ( # noqa: F401 + DistMultiTrainer, + HeterPipelineTrainer, + HeterXpuTrainer, + MultiTrainer, + PipelineTrainer, + PSGPUTrainer, +) __all__ = ["TrainerFactory", "FetchHandlerMonitor"] diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py index 745675767f150..c240273da890d 100644 --- a/python/paddle/base/unique_name.py +++ b/python/paddle/base/unique_name.py @@ -13,6 +13,7 @@ # limitations under the License. import collections + from .wrapped_decorator import signature_safe_contextmanager __all__ = ['generate', 'switch', 'guard'] @@ -121,7 +122,7 @@ def generate(key): # NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode, # in order to keep name consistency. 
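The hunks in this patch are mechanical re-orderings produced by isort rather than hand edits. A small sketch of the same grouping through isort's Python API (default settings assumed here, not the repository's actual pre-commit configuration):

    import isort

    messy = "from . import core\nimport sys\nimport paddle\n"
    print(isort.code(messy))
    # stdlib first, then the paddle import, then relative imports,
    # matching the ordering applied throughout this patch.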
def generate_with_ignorable_key(key): - from .framework import in_dygraph_mode, _dygraph_tracer + from .framework import _dygraph_tracer, in_dygraph_mode if in_dygraph_mode(): return _dygraph_tracer()._generate_unique_name() diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index 1b3039c5a8cbe..dcc87b74ea658 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools +import warnings + import numpy as np -from . import unique_name -from . import core + import paddle -import warnings -import itertools +from . import core, unique_name MAX_INTEGER = 2**31 - 1 @@ -370,9 +371,7 @@ def _setitem_for_tensor_array(var, item, value): not paddle.in_dynamic_mode() ), "setitem for tensor_array must be called in static graph mode." if isinstance(item, (Variable, int)): - from paddle.jit.dy2static.variable_trans_func import ( - to_static_variable, - ) + from paddle.jit.dy2static.variable_trans_func import to_static_variable from paddle.tensor import array_write item = paddle.cast(to_static_variable(item), dtype='int64') @@ -388,7 +387,8 @@ def _setitem_for_tensor_array(var, item, value): def _setitem_impl_(var, item, value): from paddle.base import core - from .framework import default_main_program, Variable + + from .framework import Variable, default_main_program if var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: return _setitem_for_tensor_array(var, item, value) @@ -572,9 +572,7 @@ def _setitem_impl_(var, item, value): if not paddle.in_dynamic_mode(): # map var to the new output - from paddle.jit.dy2static.program_translator import ( - ProgramTranslator, - ) + from paddle.jit.dy2static.program_translator import ProgramTranslator ProgramTranslator.get_instance()._inplace_map.add( cur_block.program, var.desc.id(), output @@ -601,8 +599,8 @@ def set_value_for_bool_tensor(var, item, value): ) def idx_not_empty(var, item, value): - from .framework import Variable from ..tensor import gather_nd, scatter_nd_add + from .framework import Variable if not isinstance(value, Variable): value = paddle.assign(value).cast(var.dtype) @@ -826,7 +824,7 @@ def _setitem_static(x, indices, values): indices(int|slice|None|Tensor|List|Tuple...): Indices, used to indicate the position of the element to be fetched. values(Tensor|Number|Ndarray): values to be assigned to the x. """ - from .framework import default_main_program, Variable + from .framework import Variable, default_main_program if x.type == paddle.base.core.VarDesc.VarType.LOD_TENSOR_ARRAY: return _setitem_for_tensor_array(x, indices, values) diff --git a/python/paddle/base/wrapped_decorator.py b/python/paddle/base/wrapped_decorator.py index 7e7dbff65611e..1567bb0d4c55c 100644 --- a/python/paddle/base/wrapped_decorator.py +++ b/python/paddle/base/wrapped_decorator.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import decorator import contextlib +import decorator + __all__ = ['wrap_decorator', 'signature_safe_contextmanager'] From eccee58b71d66c041b7c6c2554f1b83976eb4d9b Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 21 Sep 2023 11:10:14 +0800 Subject: [PATCH 21/39] [AutoParallel] Support new communication library for hogwild_worker, graph_helper, data_norm_op and margin_cross_entropy_op. 
(#57519) --- paddle/fluid/framework/hogwild_worker.cc | 69 +++++++-- paddle/fluid/framework/ir/graph_helper.cc | 17 +- paddle/fluid/operators/data_norm_op.cu | 115 +++++++++++--- .../operators/margin_cross_entropy_op.cu | 145 +++++++++++++----- .../core/distributed/comm_context_manager.cc | 14 ++ .../core/distributed/comm_context_manager.h | 8 + 6 files changed, 292 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cc2c70506a34c..e638fbcb8a54d 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -22,6 +22,13 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/lodtensor_printer.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/flags.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/core/distributed/nccl_comm_context.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif #if defined PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -30,7 +37,6 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif -#include "paddle/phi/core/flags.h" PHI_DECLARE_bool(enable_exit_when_partial_worker); @@ -152,16 +158,59 @@ bool HogwildWorker::CheckBatchNum(int flag) { } g_barrier.wait(); float *stat_ptr = sync_stat_.data(); - auto comm = - platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); + int nranks = 0; + int ring_id = 0; + platform::NCCLComm *comm = nullptr; + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + nranks = comm_ctx->GetSize(); + } else { + comm = platform::NCCLCommContext::Instance().Get(ring_id, + place_.GetDeviceId()); + nranks = comm->nranks(); + } + auto stream = static_cast(dev_ctx_)->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], - &stat_ptr[2], - 1, - ncclFloat32, - ncclProd, - comm->comm(), - stream)); + if (comm_ctx) { + // comm_ctx->AllReduce only support allreduce on the whole tensor, + // single element is not supported now. 
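The branch above is the pattern this patch repeats in every touched op: with FLAGS_dynamic_static_unified_comm set, the NCCL communicator comes from CommContextManager keyed by ring id, otherwise the legacy NCCLCommContext singleton is used. From Python the switch is just an environment flag, as the tests added later in this series do; a minimal sketch, with everything around the flag being a placeholder:

    import os

    # Opt in to the new communication library before building the program;
    # the flag name is the one exercised by the tests in this series.
    os.environ["FLAGS_dynamic_static_unified_comm"] = "1"

    # ... set up and run the distributed program as usual; the code paths
    # touched here (hogwild worker, data_norm, margin_cross_entropy, ...)
    # will then fetch their communicator via CommContextManager instead of
    # the legacy NCCLCommContext.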
+ PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllReduce(&stat_ptr[flag], + &stat_ptr[2], + 1, + ncclFloat32, + ncclProd, + comm_ctx->GetNcclComm(), + stream)); + + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], + &stat_ptr[2], + 1, + ncclFloat32, + ncclProd, + comm->comm(), + stream)); + } + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output &stat_ptr[2], sizeof(float), diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b322e3f8bce28..5d7054721db53 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -23,10 +23,14 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/platform/flags.h" PD_DECLARE_bool(convert_all_blocks); @@ -564,9 +568,16 @@ void ReplaceAllReduceOp(const Node &node, all_reduce_op_desc.SetType("c_allreduce_sum"); all_reduce_op_desc.SetInput("X", {all_reduce_var_name}); all_reduce_op_desc.SetOutput("Out", {all_reduce_var_name}); - - int ring_id = platform::NCCLCommContext::Instance().GetRingId( - dynamic_cast(&op_handle)->GetComm()); + int ring_id = -1; + if (FLAGS_dynamic_static_unified_comm) { + ring_id = phi::distributed::CommContextManager::GetInstance().GetRingId( + dynamic_cast(&op_handle)->GetComm()); + VLOG(3) << "New CommContextManager gets ring_id: " << ring_id; + } else { + ring_id = platform::NCCLCommContext::Instance().GetRingId( + dynamic_cast(&op_handle)->GetComm()); + VLOG(3) << "Old NCCLCommContext gets ring_id: " << ring_id; + } all_reduce_op_desc.SetAttr("ring_id", ring_id); all_reduce_op_desc.SetAttr("use_calc_stream", false); all_reduce_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index a212bc0ee9478..509c067e24e42 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -21,6 +21,10 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle { @@ -213,31 +217,92 @@ class DataNormGradKernel : public framework::OpKernel { if (need_sync_stats) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_size), - reinterpret_cast(d_batch_size), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_sum), - reinterpret_cast(d_batch_sum), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - reinterpret_cast(d_batch_square_sum), - reinterpret_cast(d_batch_square_sum), - C, - platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())), - ncclSum, - comm->comm(), - stream)); + int rid = 0; + platform::NCCLComm *comm = nullptr; + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get( + rid, ctx.GetPlace()); + } + + if (comm_ctx) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_size), + reinterpret_cast(d_batch_size), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_sum), + reinterpret_cast(d_batch_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_square_sum), + reinterpret_cast(d_batch_square_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm_ctx->GetNcclComm(), + stream)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_size), + reinterpret_cast(d_batch_size), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_sum), + reinterpret_cast(d_batch_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + reinterpret_cast(d_batch_square_sum), + reinterpret_cast(d_batch_square_sum), + C, + platform::ToNCCLDataType( + framework::TransToProtoVarType(x->dtype())), + ncclSum, + comm->comm(), + stream)); + } platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index d741bc5b42549..75ef56accb10b 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -30,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" @@ -39,6 +40,9 @@ namespace cub = hipcub; #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/phi/backends/gpu/gpu_context.h" @@ -87,21 +91,50 @@ void GetClassInterval(const gpuStream_t& stream, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - const auto& comm = - paddle::platform::NCCLCommContext::Instance().Get(rid, place); + paddle::platform::NCCLComm* comm = nullptr; + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext* comm_ctx = nullptr; 
+ if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get(rid, place); + } + // use global calculate stream const auto calcu_stream = static_cast(phi::DeviceContextPool::Instance().Get(place)) ->stream(); - - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - num_classes_per_device_ptr, - num_classes_per_device_ptr, - num_classes_per_device.numel(), - phi::ToNCCLDataType(num_classes_per_device.dtype()), - ncclSum, - comm->comm(), - calcu_stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&num_classes_per_device, + num_classes_per_device, + ncclSum, + calcu_stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + num_classes_per_device_ptr, + num_classes_per_device_ptr, + num_classes_per_device.numel(), + phi::ToNCCLDataType(num_classes_per_device.dtype()), + ncclSum, + comm->comm(), + calcu_stream)); + } } class_interval->Resize({nranks + 1}); @@ -238,7 +271,10 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, const auto& place = dev_ctx.GetPlace(); // old code #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - paddle::platform::NCCLComm* comm; + paddle::platform::NCCLComm* comm = nullptr; + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + phi::distributed::NCCLCommContext* comm_ctx = nullptr; paddle::distributed::ProcessGroup* pg = nullptr; gpuStream_t stream; if (nranks > 1) { @@ -247,8 +283,29 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, // Use ProcessGroup pg = map->get(ring_id); } else { - comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); - + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + } else { + comm = + paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); + } // use global calculate stream stream = static_cast( phi::DeviceContextPool::Instance().Get(place)) @@ -361,14 +418,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(logits_max_buff, - logits_max_buff, - logits_max.numel(), - phi::ToNCCLDataType(logits_max.dtype()), - ncclMax, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(logits_max_buff, + logits_max_buff, + logits_max.numel(), + phi::ToNCCLDataType(logits_max.dtype()), + ncclMax, + comm->comm(), + stream)); + } } } #endif @@ -402,14 +463,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sum_exp_logits_buff, - sum_exp_logits_buff, - sum_exp_logits.numel(), - phi::ToNCCLDataType(sum_exp_logits.dtype()), - ncclSum, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sum_exp_logits_buff, + sum_exp_logits_buff, + sum_exp_logits.numel(), + phi::ToNCCLDataType(sum_exp_logits.dtype()), + ncclSum, + comm->comm(), + stream)); + } } } #endif @@ -460,14 +525,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, auto task = pg->AllReduce(in_tensor, out_tensor, opts); task->Wait(); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::ncclAllReduce(loss_ptr, - loss_ptr, - loss->numel(), - phi::ToNCCLDataType(loss->dtype()), - ncclSum, - comm->comm(), - stream)); + if (comm_ctx) { + comm_ctx->AllReduce(loss, *loss, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(loss_ptr, + loss_ptr, + loss->numel(), + phi::ToNCCLDataType(loss->dtype()), + ncclSum, + comm->comm(), + stream)); + } } } #endif diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index e7931282724ab..342a86313bf3f 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -176,6 +176,20 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { return id_to_comm_context_.at(unique_comm_key).get(); } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +int CommContextManager::GetRingId(const ncclComm_t& comm) const { + for (auto iter = id_to_comm_context_.begin(); + iter != id_to_comm_context_.end(); + ++iter) { + if (static_cast(iter->second.get()) + ->GetNcclComm() == comm) { + return std::stoi(iter->first); + } + } + return -1; +} +#endif + bool CommContextManager::Has(const std::string& unique_comm_key) const { return id_to_comm_context_.find(unique_comm_key) != id_to_comm_context_.end(); } diff --git a/paddle/phi/core/distributed/comm_context_manager.h 
b/paddle/phi/core/distributed/comm_context_manager.h index e2cb298a0984b..dcbfaab55af90 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -22,6 +22,10 @@ #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/macros.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/backends/gpu/forwards.h" +#endif + namespace phi { namespace distributed { @@ -44,6 +48,10 @@ class CommContextManager { CommContext* Get(const std::string& unique_comm_key) const; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int GetRingId(const ncclComm_t& comm) const; +#endif + bool Has(const std::string& unique_comm_key) const; static void SetDeviceId(int dev_id); From b1536e78833f22d1833cfb1171c3e6cb364e7a09 Mon Sep 17 00:00:00 2001 From: iLeGend <824040212@qq.com> Date: Thu, 21 Sep 2023 11:11:08 +0800 Subject: [PATCH 22/39] [NewComm] No.9 compatiable upgrade for fused_attention op (#57560) * [NewComm] No.9 compatiable upgrade for fused_attention op * fix error * fix error --- .../operators/fused/fused_attention_utils.h | 50 +++++++++++++++++-- test/legacy_test/test_fused_attention_op.py | 10 ++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index 26cab895f0dfc..c059a194d0ea5 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -18,8 +18,13 @@ #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#include "paddle/fluid/distributed/collective/utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/errors.h" namespace phi { @@ -47,11 +52,46 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT auto place = dev_ctx.GetPlace(); void *recvbuff = dev_ctx.template Alloc(&tensor, tensor.numel() * sizeof(T)); - auto comm = - paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + gpuStream_t stream = nullptr; + paddle::platform::NCCLComm *comm = nullptr; + phi::distributed::NCCLCommContext *comm_ctx = nullptr; + + const auto &comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + // Use New Communication Library + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + paddle::platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + paddle::platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has ring_id" << ring_id; + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); + + stream = dev_ctx.stream(); + VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; + } + if (comm_ctx) { + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } } #else PADDLE_THROW(phi::errors::Unimplemented( diff --git a/test/legacy_test/test_fused_attention_op.py b/test/legacy_test/test_fused_attention_op.py index af734c96d19d8..0e012659f95f6 100644 --- a/test/legacy_test/test_fused_attention_op.py +++ b/test/legacy_test/test_fused_attention_op.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import unittest import numpy as np @@ -31,6 +32,7 @@ class TestFusedAttentionOp(OpTest): def setUp(self): + self.with_new_comm() self.config() self.generate_input_data() @@ -79,6 +81,9 @@ def setUp(self): paddle.set_default_dtype(self.x_type) self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + def with_new_comm(self): + os.environ["FLAGS_dynamic_static_unified_comm"] = "0" + def config(self): self.x_type = np.float32 self.attn_mask_type = np.float64 @@ -350,6 +355,11 @@ def test_fused_attention_op(self): ) +class TestFusedAttentionOpWithNewComm(TestFusedAttentionOp): + def with_new_comm(self): + os.environ["FLAGS_dynamic_static_unified_comm"] = "1" + + class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp): def config(self): super().config() From 6e9143181a8c4ba7253be9690f198cec8326e5a4 Mon Sep 17 00:00:00 2001 From: ooo oo <106524776+ooooo-create@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:12:43 +0800 Subject: [PATCH 23/39] [CodeStyle][task 11] enable Ruff F403 rule in `python/paddle/base/__init__.py` (#57501) --- pyproject.toml | 2 -- python/paddle/base/__init__.py | 47 +++++++++++++++++++++++--- python/paddle/base/core.py | 2 +- python/paddle/base/dygraph/__init__.py | 13 +++++-- python/paddle/base/io.py | 5 ++- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8dd98b65873aa..eca2770cb1b4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,8 +109,6 @@ ignore = [ "UP031", "C408", "UP030", - "F522", - "F403", "C405", "C417", "PLR0402", diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py index 6eec276eee03d..acc6f9f51ae2f 100644 --- a/python/paddle/base/__init__.py +++ b/python/paddle/base/__init__.py @@ -34,17 +34,48 @@ # import all class inside framework into base module from . 
import framework -from .framework import * # noqa: F403 +from .framework import ( + Program, + default_startup_program, + default_main_program, + program_guard, + name_scope, + ipu_shard_guard, + set_ipu_shard, + cuda_places, + cpu_places, + xpu_places, + cuda_pinned_places, + in_dygraph_mode, + in_pir_mode, + in_dynamic_or_pir_mode, + is_compiled_with_cinn, + is_compiled_with_cuda, + is_compiled_with_rocm, + is_compiled_with_xpu, + Variable, + require_version, + device_guard, + set_flags, + get_flags, +) # import all class inside executor into base module from . import executor -from .executor import * # noqa: F403 +from .executor import ( + Executor, + global_scope, + scope_guard, +) from . import data_feed_desc -from .data_feed_desc import * # noqa: F403 +from .data_feed_desc import DataFeedDesc from . import dataset -from .dataset import * # noqa: F403 +from .dataset import ( + DatasetFactory, + InMemoryDataset, +) from . import trainer_desc @@ -72,7 +103,13 @@ from . import unique_name from . import compiler -from .compiler import * # noqa: F403 +from .compiler import ( + CompiledProgram, + ExecutionStrategy, + BuildStrategy, + IpuCompiledProgram, + IpuStrategy, +) from paddle.base.layers.math_op_patch import monkey_patch_variable from .dygraph.base import enable_dygraph, disable_dygraph from .dygraph.tensor_patch_methods import monkey_patch_tensor diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index df90a6ace8582..285a9f1b1a61b 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -278,7 +278,7 @@ def to_list(s): # assign tensor alias libpaddle.LoDTensor = libpaddle.Tensor - from .libpaddle import * + from .libpaddle import * # noqa: F403 from .libpaddle import ( # noqa: F401 __doc__, __file__, diff --git a/python/paddle/base/dygraph/__init__.py b/python/paddle/base/dygraph/__init__.py index 6355ca337b9f8..2ac4df711681c 100644 --- a/python/paddle/base/dygraph/__init__.py +++ b/python/paddle/base/dygraph/__init__.py @@ -13,10 +13,19 @@ # limitations under the License. from . import base -from .base import * # noqa: F403 +from .base import ( + no_grad, + no_grad_, + grad, + guard, + enable_dygraph, + disable_dygraph, + enabled, + to_variable, +) from . import tracer -from .tracer import * # noqa: F403 +from .tracer import Tracer __all__ = [] diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py index a2c7d02ede349..55f5c072f4e27 100644 --- a/python/paddle/base/io.py +++ b/python/paddle/base/io.py @@ -17,7 +17,10 @@ from paddle.base.log_helper import get_logger from . 
import reader -from .reader import * +from .reader import ( # noqa: F401 + PyReader, + DataLoader, +) __all__ = reader.__all__ From 9650cf907fe3d574215e2949785075478096b8d9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 21 Sep 2023 11:20:02 +0800 Subject: [PATCH 24/39] [PIR]Rename flags (#57496) * rename flag * fix py3 bugs * modify demo code --- paddle/fluid/framework/feed_fetch_method.cc | 2 +- .../new_executor/standalone_executor.cc | 8 +++--- .../tensor_operants_gen.py | 24 ++++++++--------- paddle/phi/core/flags.cc | 6 ++--- python/paddle/base/framework.py | 8 +++--- python/paddle/pir_utils.py | 26 +++++++++---------- test/ir/new_ir/CMakeLists.txt | 2 +- test/ir/new_ir/test_ir_backward.py | 6 ++--- test/prim/new_ir_prim/CMakeLists.txt | 2 +- 9 files changed, 41 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 1f2f645f97dc8..7a62b5563f30a 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "glog/logging.h" PHI_DECLARE_bool(enable_new_ir_in_executor); -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); namespace phi { class DenseTensor; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 99b42bee8b73f..f06bee2c884e3 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -28,7 +28,7 @@ #include "paddle/pir/pass/pass_manager.h" PHI_DECLARE_bool(enable_new_ir_in_executor); -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); PHI_DECLARE_bool(new_ir_apply_inplace_pass); namespace paddle { @@ -55,7 +55,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, const std::string& job_type = job->Type(); std::shared_ptr program = nullptr; std::shared_ptr<::pir::Program> ir_program = nullptr; - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { ir_program = plan_.IrProgram(job_type); } else { program = std::make_shared(*(plan_.Program(job_type))); @@ -69,7 +69,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, micro_batch_id, micro_batch_num)); - if (micro_batch_num > 1 && !FLAGS_enable_new_ir_api) { + if (micro_batch_num > 1 && !FLAGS_enable_pir_api) { SetColAttrForFeedFetchOps(program, micro_batch_num, micro_batch_id); } @@ -80,7 +80,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, // TODO(phlrain) we only support cpu for now if (FLAGS_enable_new_ir_in_executor) { std::shared_ptr<::pir::Program> base_program = ir_program; - if (!FLAGS_enable_new_ir_api) { + if (!FLAGS_enable_pir_api) { VLOG(6) << "begin to translate" << std::endl; base_program = paddle::TranslateLegacyProgramToProgram(*program); } diff --git a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py index 783066f0fc906..0bc050f00d4a0 100644 --- a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py +++ b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py @@ -214,7 +214,7 @@ class StaticTensorOperants : public TensorOperantsBase { #include "paddle/fluid/primitive/backend/backend.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" -PHI_DECLARE_bool(enable_new_ir_api); +PHI_DECLARE_bool(enable_pir_api); """ @@ -227,7 +227,7 @@ class 
StaticTensorOperants : public TensorOperantsBase { using LazyTensor = paddle::primitive::LazyTensor; Tensor StaticTensorOperants::add(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::add(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::add(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -235,7 +235,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::subtract(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::subtract(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::subtract(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -243,7 +243,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::multiply(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::scale(x, y, 0.0f, true); } else { return paddle::prim::scale(x, y, 0.0f, true); @@ -251,7 +251,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::divide(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::divide(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::divide(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -259,7 +259,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::add(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::add(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::add(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -268,7 +268,7 @@ class StaticTensorOperants : public TensorOperantsBase { Tensor StaticTensorOperants::subtract(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::subtract(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::subtract(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -276,7 +276,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::multiply(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::scale(y, x, 0.0f, true); } else { return paddle::prim::scale(y, x, 0.0f, true); @@ -284,7 +284,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::divide(const Scalar& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::divide(paddle::primitive::backend::full(y.shape(), x, y.dtype(), y.place()), y); } else { return paddle::prim::divide(paddle::prim::full(y.shape(), x, y.dtype(), y.place()), y); @@ -292,7 +292,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::pow(const Tensor& x, const Tensor& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::elementwise_pow(x, y); } else { return 
paddle::prim::elementwise_pow(x, y); @@ -300,7 +300,7 @@ class StaticTensorOperants : public TensorOperantsBase { } Tensor StaticTensorOperants::pow(const Tensor& x, const Scalar& y) { - if (FLAGS_enable_new_ir_api) { + if (FLAGS_enable_pir_api) { return paddle::primitive::backend::elementwise_pow(x, paddle::primitive::backend::full(x.shape(), y, x.dtype(), x.place())); } else { return paddle::prim::elementwise_pow(x, paddle::prim::full(x.shape(), y, x.dtype(), x.place())); @@ -393,7 +393,7 @@ def gene_static_tensor_func_call(self): ) static_func_parameters = self.get_func_args() - static_tensor_func_call = f"""if (FLAGS_enable_new_ir_api) {{ + static_tensor_func_call = f"""if (FLAGS_enable_pir_api) {{ return {backend_static_func_name}({static_func_parameters}); }} else {{ return {prim_static_func_name}({static_func_parameters}); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index e02868d5e2c1b..ce03cdb3f4d69 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1278,15 +1278,13 @@ PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor, /** * Using new IR API in Python - * Name: enable_new_ir_api + * Name: enable_pir_api * Since Version: 2.6.0 * Value Range: bool, default=false * Example: * Note: If Ture, New IR API will be used in Python */ -PHI_DEFINE_EXPORTED_bool(enable_new_ir_api, - false, - "Enable new IR API in Python"); +PHI_DEFINE_EXPORTED_bool(enable_pir_api, false, "Enable new IR API in Python"); /** * Using new IR in executor FLAG diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 0440af415a7d0..d3f17ea6435e9 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -162,8 +162,8 @@ def __init__(self): self._in_to_static_mode_ = False self._functional_dygraph_context_manager = None self._dygraph_tracer_ = _dygraph_tracer_ - self._use_pir_api_ = get_flags("FLAGS_enable_new_ir_api")[ - 'FLAGS_enable_new_ir_api' + self._use_pir_api_ = get_flags("FLAGS_enable_pir_api")[ + 'FLAGS_enable_pir_api' ] def __str__(self): @@ -340,8 +340,8 @@ def in_dynamic_or_pir_mode(): >>> print(paddle.framework.in_dynamic_or_pir_mode()) False - >>> paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) - >>> print(paddle.framework.in_dynamic_or_pir_mode()) + >>> with paddle.pir_utils.IrGuard(): + ... 
print(paddle.framework.in_dynamic_or_pir_mode()) True """ diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py index a62fe6f61a924..9af825cfcd88b 100644 --- a/python/paddle/pir_utils.py +++ b/python/paddle/pir_utils.py @@ -19,11 +19,11 @@ class IrGuard: def __init__(self): self.in_dygraph_outside = False - old_flag = paddle.base.framework.get_flags("FLAGS_enable_new_ir_api") - paddle.base.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + old_flag = paddle.base.framework.get_flags("FLAGS_enable_pir_api") + paddle.base.framework.set_flags({"FLAGS_enable_pir_api": False}) paddle.base.framework.global_var._use_pir_api_ = False - if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: self.old_Program = paddle.static.Program self.old_program_guard = paddle.base.program_guard @@ -34,31 +34,31 @@ def __init__(self): else: raise RuntimeError( "IrGuard only init when paddle.framework.in_pir_mode(): is false, \ - please set FLAGS_enable_new_ir_api = false" + please set FLAGS_enable_pir_api = false" ) paddle.base.framework.set_flags(old_flag) paddle.base.framework.global_var._use_pir_api_ = old_flag[ - "FLAGS_enable_new_ir_api" + "FLAGS_enable_pir_api" ] def __enter__(self): self.in_dygraph_outside = paddle.base.framework.in_dygraph_mode() if self.in_dygraph_outside: paddle.enable_static() - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": True}) paddle.base.framework.global_var._use_pir_api_ = True self._switch_to_pir() def __exit__(self, exc_type, exc_val, exc_tb): - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) paddle.base.framework.global_var._use_pir_api_ = False self._switch_to_old_ir() if self.in_dygraph_outside: paddle.disable_static() def _switch_to_pir(self): - if paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: paddle.framework.set_flags( {"FLAGS_enable_new_ir_in_executor": True} @@ -76,8 +76,8 @@ def _switch_to_pir(self): ) def _switch_to_old_ir(self): - if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[ - "FLAGS_enable_new_ir_api" + if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[ + "FLAGS_enable_pir_api" ]: paddle.framework.set_flags( {"FLAGS_enable_new_ir_in_executor": False} @@ -93,5 +93,5 @@ def _switch_to_old_ir(self): else: raise RuntimeError( "IrGuard._switch_to_old_ir only work when paddle.framework.in_pir_mode() is false, \ - please set FLAGS_enable_new_ir_api = false" + please set FLAGS_enable_pir_api = false" ) diff --git a/test/ir/new_ir/CMakeLists.txt b/test/ir/new_ir/CMakeLists.txt index e213eaba4c53c..75587db97c088 100644 --- a/test/ir/new_ir/CMakeLists.txt +++ b/test/ir/new_ir/CMakeLists.txt @@ -15,7 +15,7 @@ foreach(target ${TEST_INTERP_CASES}) endforeach() foreach(target ${TEST_IR_SYSTEM_CASES}) - py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_new_ir_api=true) + py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_pir_api=true) endforeach() set_tests_properties(test_pd_inplace_pass PROPERTIES TIMEOUT 60) diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py index acffcf4ee28d6..c604290d34cad 100644 --- a/test/ir/new_ir/test_ir_backward.py +++ 
b/test/ir/new_ir/test_ir_backward.py @@ -38,7 +38,7 @@ def get_ir_program_0(): class TesBackward_1(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_grad(self): newir_program = get_ir_program_0() @@ -155,7 +155,7 @@ def get_ir_program_1(): class TesBackward_2(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_add_n(self): newir_program = get_ir_program_1() @@ -231,7 +231,7 @@ def get_ir_program_2(): class TestBackward_3(unittest.TestCase): def tearDown(self) -> None: - paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + paddle.framework.set_flags({"FLAGS_enable_pir_api": False}) def test_basic_network(self): newir_program = get_ir_program_2() diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt index 1b37b432d2052..a36e905e0c9f4 100644 --- a/test/prim/new_ir_prim/CMakeLists.txt +++ b/test/prim/new_ir_prim/CMakeLists.txt @@ -3,7 +3,7 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 - FLAGS_enable_new_ir_api=true) + FLAGS_enable_pir_api=true) endforeach() file( From c882037892eaa80250a2e06b3f032326a1629661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:24:42 +0800 Subject: [PATCH 25/39] remove SetTensorDynamicRange in softmax (#57538) --- paddle/fluid/inference/tensorrt/convert/softmax_op.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 8e101075768e0..9aefd7fb28b39 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -84,8 +84,6 @@ class SoftMaxOpConverter : public OpConverter { } layer->setAxes(1 << axes); - // The trt will not run int for softmax. 
- engine_->SetTensorDynamicRange(input1, 1.0); auto output_name = op_desc.Output("Out")[0]; // support 0 or 1 dims input From 5be4e463cde24dec8cd0cb60833224022f24f90e Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 21 Sep 2023 12:13:30 +0800 Subject: [PATCH 26/39] [PIR]Fix arange op and assign op bug (#57494) * fix arange kernel selected bug * revert some code * fix compile bug --- .../fluid/ir_adaptor/translator/op_translator.cc | 4 ++-- .../fluid/pir/transforms/pd_op_to_kernel_pass.cc | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index e3eeaab4f7d48..b11101de616b8 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -754,8 +754,8 @@ struct AssignValueOpTranscriber : public OpTranscriber { attribute_translator(attr_info_maps.at("dtype").type_name, legacy_attr); attribute_map["dtype"] = attr_dtype; - pir::Attribute attr_place = - dialect::PlaceAttribute::get(ctx, phi::CPUPlace()); + pir::Attribute attr_place = dialect::PlaceAttribute::get( + ctx, phi::Place(phi::AllocationType::UNDEFINED)); attribute_map["place"] = attr_place; int dtype = paddle::get(op_desc.GetAttr("dtype")); diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index d77161992c311..79e6bbe71230e 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -401,7 +401,8 @@ phi::DataType GetKernelDataTypeByYamlInfo( phi::Backend GetKernelBackendByYamlInfo( const pir::Operation* op, const std::unordered_map& map_value_pair, - const dialect::OpYamlInfoParser* op_info_parser) { + const dialect::OpYamlInfoParser* op_info_parser, + const phi::Place& place) { auto& attr_map = op->attributes(); auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend; phi::Backend kernel_backend = phi::Backend::UNDEFINED; @@ -465,6 +466,10 @@ phi::Backend GetKernelBackendByYamlInfo( } } + if (backend_info.size() > 0 && kernel_backend == phi::Backend::UNDEFINED) { + kernel_backend = paddle::experimental::ParseBackend(place); + } + return kernel_backend; } @@ -518,7 +523,7 @@ phi::KernelKey GetKernelKey( kernel_data_type = GetKernelDataTypeByYamlInfo(op, map_value_pair, op_info_parser); kernel_backend = - GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser); + GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser, place); // parse all the input tensor if (tensor_input_number == 0 || op->isa()) { @@ -550,7 +555,9 @@ phi::KernelKey GetKernelKey( } } - if (op->num_operands() > 0) { + if ((kernel_backend == phi::Backend::UNDEFINED || + kernel_data_type == phi::DataType::UNDEFINED) && + op->num_operands() > 0) { paddle::experimental::detail::KernelKeyParser kernel_key_parser; for (size_t i = 0; i < op->num_operands(); ++i) { @@ -724,7 +731,7 @@ void HandleForSpecialOp( pir::IrContext* ctx, std::unordered_map* map_op_pair, std::unordered_map* map_value_pair) { - if (op_item->name() == "pd_op.if") { + if (op_item->isa()) { HandleForIfOp(place, op_item, block, ctx, map_op_pair, map_value_pair); return; } From b13dcb85918bb467ebe557093e22bc2482479c93 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:13:23 +0800 Subject: [PATCH 27/39] support pir jit prim (#57561) --- 
.../jit/dy2static/newir_partial_program.py | 32 +++++++--- .../jit/dy2static/program_translator.py | 60 +++++++++++++++++-- test/prim/new_ir_prim/CMakeLists.txt | 2 +- test/prim/new_ir_prim/test_prim_jit.py | 58 ++++++++++++++++++ 4 files changed, 138 insertions(+), 14 deletions(-) create mode 100644 test/prim/new_ir_prim/test_prim_jit.py diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py index 83cb5eed92534..c0da8f35c822a 100644 --- a/python/paddle/jit/dy2static/newir_partial_program.py +++ b/python/paddle/jit/dy2static/newir_partial_program.py @@ -642,11 +642,15 @@ def _insert_aggregation_ops_for_var(target_program, var): @switch_to_static_graph def _append_backward_desc(self, main_program): program = main_program - # if self._hooker: - # program = self._hooker.before_append_backward(program) + targets = list( filter(lambda x: isinstance(x, OpResult), self._outputs.tolist()) ) + if self._hooker: + program, targets = self._hooker.before_append_backward( + program, targets + ) + self._outputs = NestSequence(targets, need_check=True) inputs = list( filter(lambda x: isinstance(x, OpResult), self._inputs.tolist()) ) @@ -676,11 +680,15 @@ def _append_backward_desc(self, main_program): forward_outputs_grads.append(opres) not_stop_gradient_num += 1 - # TODO: add later. - # if self._hooker: - # program, start_idx = self._hooker.after_append_backward( - # program, start_idx - # ) + if self._hooker: + ( + program, + forward_end_idx, + targets, + ) = self._hooker.after_append_backward( + program, targets, forward_end_idx + ) + self._outputs = NestSequence(targets, need_check=True) # TODO: add later # self.prepare_gradient_aggregation( @@ -692,6 +700,8 @@ def _append_backward_desc(self, main_program): ) hash_id = paddle.utils._hash_with_id(program, self) extra_info = self._program_extra_info.get(hash_id, {}) + extra_info['forward_inputs'] = inputs + extra_info['forward_outputs'] = targets extra_info['forward_end_op_idx'] = forward_end_idx extra_info['forward_inputs_grads'] = list( map(mapping_op_result, grad_info_map) @@ -791,8 +801,10 @@ def _get_forward_backward_program_form( forward_inputs_grads = self.get_program_extra(whole_program)[ 'forward_inputs_grads' ] - forward_inputs = self._inputs.tolist() - forward_outputs = self._outputs.tolist() + forward_inputs = self.get_program_extra(whole_program)['forward_inputs'] + forward_outputs = self.get_program_extra(whole_program)[ + 'forward_outputs' + ] forward_outputs_grads = self.get_program_extra(whole_program)[ 'forward_outputs_grads' ] @@ -947,9 +959,11 @@ def create_out(var_id): tensor_type = paddle.dtype(8) # SELECT ROW TENSOR # TODO(xiongkun): more elegent way to do it. 
+ ir_dtype_2_tensor_dtype = { 10: paddle.dtype(5), } + out = core.eager.Tensor( ir_dtype_2_tensor_dtype[int(var.dtype)], var.shape, diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 592665596cfef..8eb118852a764 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -21,6 +21,7 @@ import weakref import paddle.ir.core as ir_static +from paddle import decomposition from paddle.base import core, framework from paddle.base.data_feeder import check_type from paddle.base.dygraph.base import ( @@ -42,6 +43,9 @@ get_buffers, get_parameters, ) +from .newir_partial_program import ( + PartialProgramLayerHook as PirPartialProgramLayerHook, +) from .origin_info import ( attach_origin_info, create_and_update_origin_info_map, @@ -1473,6 +1477,46 @@ def __setattr__(self, key, value): return super().__setattr__(key, value) +class PirPrimHooker(PirPartialProgramLayerHook): + def __init__(self, original_program, backend): + self.backend = backend + self.custom_vjps = set() + with backend_guard(self.backend): + if core._is_all_prim_enabled(): + self.custom_vjps = { + op.name() + for op in original_program.global_block().ops + if core.has_custom_vjp(op) + } + + def before_append_backward(self, forward_program, src_vars): + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + dst_vars = decomposition.decompose( + forward_program, src_vars, blacklist=self.custom_vjps + ) + return forward_program, dst_vars + + def after_append_backward(self, whole_program, src_vars, forward_end_idx): + with backend_guard(self.backend): + backward_length = ( + len(whole_program.global_block().ops) - forward_end_idx + ) + if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0: + # only process backward part of block + dst_vars = decomposition.decompose(whole_program, src_vars) + new_start_index = ( + len(whole_program.global_block().ops) - backward_length + ) + return whole_program, new_start_index, dst_vars + + def after_infer(self, infer_program, src_vars): + with backend_guard(self.backend): + if core._is_fwd_prim_enabled(): + dst_vars = decomposition.decompose(infer_program, src_vars) + return infer_program, dst_vars + + class ProgramCache: """ Wrapper class for the program functions defined by dygraph function. 
@@ -1530,7 +1574,10 @@ def _build_once(self, cache_key): raise backend = cache_key.kwargs['backend'] - if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend): + if ( + prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend) + and not use_pir_api() + ): for var in concrete_program.main_program.list_vars(): if var.type not in NO_SHAPE_VAR_TYPE and -1 in var.shape: warnings.warn( @@ -1553,9 +1600,14 @@ def _build_once(self, cache_key): ) with backend_guard(backend): if core._is_fwd_prim_enabled(): - partial_program.set_hooker( - PrimHooker(concrete_program.main_program, backend) - ) + if use_pir_api(): + partial_program.set_hooker( + PirPrimHooker(concrete_program.main_program, backend) + ) + else: + partial_program.set_hooker( + PrimHooker(concrete_program.main_program, backend) + ) return concrete_program, partial_program def __getitem__(self, item): diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt index a36e905e0c9f4..e1cbcd60f8ee4 100644 --- a/test/prim/new_ir_prim/CMakeLists.txt +++ b/test/prim/new_ir_prim/CMakeLists.txt @@ -1,5 +1,5 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet - test_prim_custom_vjp) + test_prim_custom_vjp test_prim_jit) foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 diff --git a/test/prim/new_ir_prim/test_prim_jit.py b/test/prim/new_ir_prim/test_prim_jit.py new file mode 100644 index 0000000000000..72958eff9a1d7 --- /dev/null +++ b/test/prim/new_ir_prim/test_prim_jit.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.framework import core + + +class TestDy2staticNewIR(unittest.TestCase): + def test_basic_network_backward(self): + core._set_prim_all_enabled(True) + + def func(x): + x1 = paddle.mean(x) + out = paddle.nn.functional.gelu(x1, False) + return out + + # ==== dygraph computation ==== + static_func = paddle.jit.to_static(func) + x = paddle.randn((8, 16, 64)) + x.stop_gradient = False + ref_out = func(x) * 2 + ref_out.backward() + ref_grad = x.grad.numpy() + x.clear_gradient() + + # ==== to static compuatation ==== + out = static_func(x) + actual_out = out * 2 + actual_out.backward() + actual_grad = x.grad + core._set_prim_all_enabled(True) + + np.testing.assert_allclose( + ref_out, actual_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + np.testing.assert_allclose( + ref_grad, actual_grad.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == "__main__": + unittest.main() From e24119c3e6ac49486f83fcdafad0ae6844a7633a Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:14:02 +0800 Subject: [PATCH 28/39] [Fix] fix multi device compile error (#57530) Add device_id directory when dumping information. Reduce threads during multi card compilation. 
--- paddle/cinn/backends/compiler.cc | 41 ++++++++++++++----- paddle/cinn/backends/compiler.h | 19 ++++++--- paddle/cinn/hlir/framework/graph.cc | 10 ++++- .../cinn/hlir/framework/parallel_compiler.cc | 38 ++++++++++++----- .../cinn/hlir/framework/parallel_compiler.h | 11 ++++- 5 files changed, 88 insertions(+), 31 deletions(-) diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 448bef2392a9f..a913a3de86692 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -45,7 +45,7 @@ using CompilationStatus = hlir::framework::CompilationStatus; static constexpr int DebugLogMaxLen = 30000; void CompilationInfoDumper::DumpLoweredFuncByGroupIndex( - const ir::LoweredFunc& lowered_func, const int gidx) { + const ir::LoweredFunc& lowered_func, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_lowered_func.empty() || lowered_func.get() == nullptr) { return; @@ -54,34 +54,42 @@ void CompilationInfoDumper::DumpLoweredFuncByGroupIndex( content << lowered_func; Dump(FLAGS_cinn_dump_group_lowered_func, gidx, + device_id, "lowered_function.txt", content.str()); } void CompilationInfoDumper::DumpSourceCodeByGroupIndex( - const std::string& source_code, const int gidx) { + const std::string& source_code, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_source_code.empty()) { return; } - Dump(FLAGS_cinn_dump_group_source_code, gidx, "source_code.cu", source_code); + Dump(FLAGS_cinn_dump_group_source_code, + gidx, + device_id, + "source_code.cu", + source_code); } void CompilationInfoDumper::DumpPtxCodeByGroupIndex( - const std::string& source_ptx, const int gidx) { + const std::string& source_ptx, const int gidx, const int device_id) { if (FLAGS_cinn_dump_group_ptx.empty()) { return; } - Dump(FLAGS_cinn_dump_group_ptx, gidx, "source_ptx.ptx", source_ptx); + Dump( + FLAGS_cinn_dump_group_ptx, gidx, device_id, "source_ptx.ptx", source_ptx); } void CompilationInfoDumper::DumpInstructionByGroupIndex( const std::unique_ptr& instr, - const int gidx) { + const int gidx, + const int device_id) { if (FLAGS_cinn_dump_group_instruction.empty() || instr.get() == nullptr) { return; } Dump(FLAGS_cinn_dump_group_instruction, gidx, + device_id, "instruction.txt", instr->DumpInstruction()); } @@ -99,6 +107,7 @@ void CompilationInfoDumper::DumpLoweredFunc() { } Dump(FLAGS_cinn_dump_group_lowered_func, idx, + device_id_, "lowered_function.txt", content.str()); } @@ -115,7 +124,11 @@ void CompilationInfoDumper::DumpSourceCode() { } else { dump_str = "[No source code generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_source_code, idx, "source_code.cu", dump_str); + Dump(FLAGS_cinn_dump_group_source_code, + idx, + device_id_, + "source_code.cu", + dump_str); } } @@ -130,7 +143,8 @@ void CompilationInfoDumper::DumpPtxCode() { } else { dump_str = "[No source ptxs generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_ptx, idx, "source_ptx.ptx", dump_str); + Dump( + FLAGS_cinn_dump_group_ptx, idx, device_id_, "source_ptx.ptx", dump_str); } } @@ -145,16 +159,21 @@ void CompilationInfoDumper::DumpInstruction() { } else { dump_str = "[No instruction generated]\n\n" + info_.Message(idx); } - Dump(FLAGS_cinn_dump_group_instruction, idx, "instruction.txt", dump_str); + Dump(FLAGS_cinn_dump_group_instruction, + idx, + device_id_, + "instruction.txt", + dump_str); } } void CompilationInfoDumper::Dump(const std::string& base_path, const int idx, + const int device_id, const std::string& file_name, const std::string& 
content) { - auto dump_path = - utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx); + auto dump_path = utils::StringFormat( + "%s/device_%d/fusion_group_%d", base_path.c_str(), device_id, idx); if (!hlir::framework::MakeDirectory( dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { LOG(WARNING) << "Failed to make directory: \"" << dump_path diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index 8b09573b522e4..a468193d4d85a 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -43,8 +43,9 @@ namespace backends { */ class CompilationInfoDumper { public: - explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info) - : info_(info) { + explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info, + const int device_id) + : info_(info), device_id_(device_id) { DumpLoweredFunc(); DumpSourceCode(); DumpPtxCode(); @@ -52,14 +53,18 @@ class CompilationInfoDumper { } static void DumpLoweredFuncByGroupIndex(const ir::LoweredFunc& lowered_func, - const int gidx); + const int gidx, + const int device_id); static void DumpSourceCodeByGroupIndex(const std::string& source_code, - const int gidx); + const int gidx, + const int device_id); static void DumpPtxCodeByGroupIndex(const std::string& source_ptx, - const int gidx); + const int gidx, + const int device_id); static void DumpInstructionByGroupIndex( const std::unique_ptr& instr, - const int gidx); + const int gidx, + const int device_id); private: void DumpLoweredFunc(); @@ -68,10 +73,12 @@ class CompilationInfoDumper { void DumpInstruction(); static void Dump(const std::string& base_path, const int idx, + const int device_id, const std::string& file_name, const std::string& content); const hlir::framework::CompilationResult& info_; + const int device_id_; }; class SourceCodePrint { diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc index 3f81b8b91906d..4c8d166e4cc4a 100644 --- a/paddle/cinn/hlir/framework/graph.cc +++ b/paddle/cinn/hlir/framework/graph.cc @@ -18,6 +18,9 @@ #include #include "paddle/cinn/hlir/framework/visualize_helper.h" +#ifdef CINN_WITH_CUDA +#include "paddle/cinn/runtime/cuda/cuda_util.h" +#endif #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" @@ -315,9 +318,14 @@ void Graph::VisualizeGroupedGraph( const auto& group_dots = VisualizeGroups(groups, fetch_var_ids); for (int idx = 0; idx < groups.size(); ++idx) { // Create fusion_group_x folder + int device_id = 0; +#ifdef CINN_WITH_CUDA + cudaGetDevice(&device_id); +#endif auto group_path = - utils::StringFormat("%s/fusion_group_%d", + utils::StringFormat("%s/device_%d/fusion_group_%d", FLAGS_cinn_fusion_groups_graphviz_dir.c_str(), + device_id, idx); if (!MakeDirectory(group_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc index bae6048477623..3a15f7c42bef0 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.cc +++ b/paddle/cinn/hlir/framework/parallel_compiler.cc @@ -80,8 +80,13 @@ void ParallelCompiler::SplitTask() { CHECK(context_->lowered_funcs.empty() || context_->graph->fusion_groups.size() == context_->lowered_funcs.size()); - for (int i = 0; i < context_->graph->fusion_groups.size(); ++i) { - tasks_.emplace_back(i, this, context_); + int device_id = 0; +#ifdef CINN_WITH_CUDA + CUDA_CALL(cudaGetDevice(&device_id)); +#endif + for (int group_id = 0; group_id < 
context_->graph->fusion_groups.size(); + ++group_id) { + tasks_.emplace_back(device_id, group_id, this, context_); } } @@ -126,11 +131,20 @@ void ParallelCompiler::RunTask() { } void ParallelCompiler::LaunchTask() { + int device_id = 0; +#ifdef CINN_WITH_CUDA + CUDA_CALL(cudaGetDevice(&device_id)); +#endif + int num_threads = FLAGS_cinn_parallel_compile_thread; +#if defined(PADDLE_WITH_DISTRIBUTE) + if (device_id > 0) { + num_threads = 1; + } +#endif // multi thread compilation std::vector threads; - VLOG(4) << "Compile with " << FLAGS_cinn_parallel_compile_thread - << " threads"; - for (int idx = 1; idx < FLAGS_cinn_parallel_compile_thread; ++idx) { + VLOG(4) << "Compile with " << num_threads << " threads"; + for (int idx = 1; idx < num_threads; ++idx) { threads.emplace_back(&ParallelCompiler::RunTask, this); } @@ -208,7 +222,7 @@ void ParallelCompiler::Task::Lowering() { pcompiler->result_.SetLoweredFuncs(group_id, lowered_funcs); } backends::CompilationInfoDumper::DumpLoweredFuncByGroupIndex( - pcompiler->result_.LoweredFuncs(group_id).front(), group_id); + pcompiler->result_.LoweredFuncs(group_id).front(), group_id, device_id); } void ParallelCompiler::Task::CodegenAndJit() { @@ -239,8 +253,8 @@ void ParallelCompiler::Task::CodegenAndJit() { } CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n" << dmodule; - backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex(cuda_c, - group_id); + backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex( + cuda_c, group_id, device_id); pcompiler->result_.SetSourceCode(group_id, cuda_c); cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c); @@ -249,7 +263,8 @@ void ParallelCompiler::Task::CodegenAndJit() { backends::nvrtc::Compiler compiler; auto ptx = compiler(cuda_c); CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex(ptx, group_id); + backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex( + ptx, group_id, device_id); pcompiler->result_.SetSourcePtx(group_id, ptx); // load cumodule cumodule = std::make_unique(ptx, @@ -260,7 +275,7 @@ void ParallelCompiler::Task::CodegenAndJit() { // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { - auto cufunc = cumodule->GetFunction(0, fn->name); + auto cufunc = cumodule->GetFunction(device_id, fn->name); CHECK(cufunc); symbols.RegisterVar(fn->name + "_ptr_", reinterpret_cast(cufunc)); } @@ -291,7 +306,8 @@ void ParallelCompiler::Task::BuildInstruction() { instr->SetLoweredFunc(reinterpret_cast(fn_ptr), group->GetFuncName()); instr->Finalize(); - backends::CompilationInfoDumper::DumpInstructionByGroupIndex(instr, group_id); + backends::CompilationInfoDumper::DumpInstructionByGroupIndex( + instr, group_id, device_id); pcompiler->result_.SetInstruction(group_id, std::move(instr)); } diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h index e78ee99404867..df0d39ebe2afc 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.h +++ b/paddle/cinn/hlir/framework/parallel_compiler.h @@ -36,8 +36,14 @@ namespace framework { class ParallelCompiler { public: struct Task { - Task(int group_id, ParallelCompiler* compiler, CompilationContext* context) - : group_id(group_id), pcompiler(compiler), context(context) {} + Task(int device_id, + int group_id, + ParallelCompiler* compiler, + CompilationContext* context) + : device_id(device_id), + group_id(group_id), + pcompiler(compiler), + context(context) {} 
void Lowering(); void CodegenAndJit(); void BuildInstruction(); @@ -48,6 +54,7 @@ class ParallelCompiler { CompilationStatus status = CompilationStatus::SUCCESS; std::string message; + const int device_id; int group_id; std::unique_ptr engine; From be463d319530ec7ae1b5d4d5ecb7f1d3d0dbb445 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:17:21 +0800 Subject: [PATCH 29/39] [PIR]add all add , mul newir optest (#57533) * add all add mul newir optest * add sub optest * delete sub --- test/legacy_test/test_elementwise_add_op.py | 21 +++-------- test/legacy_test/test_elementwise_mul_op.py | 42 ++++++--------------- 2 files changed, 17 insertions(+), 46 deletions(-) diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 279d1997f160e..8bacfc9a45cfd 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -212,7 +212,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_new_ir=True) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -738,27 +738,16 @@ def init_input_output(self): self.out = self.x + self.y def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=False) def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 86f4e764916e0..8013eb0baaf15 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -128,24 +128,13 @@ def if_enable_cinn(self): self.enable_cinn = False def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp): @@ -196,7 +185,7 @@ def setUp(self): self.if_enable_cinn() def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=True) def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True) @@ -274,6 +263,7 @@ def test_check_output(self): self.check_output( check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_normal(self): @@ -282,6 +272,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_ingore_x(self): @@ -291,6 +282,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), 
check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def test_check_grad_ingore_y(self): @@ -300,6 +292,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=self.check_dygraph, check_prim=self.check_prim, + check_new_ir=self.check_dygraph, ) def init_input_attr_output(self): @@ -527,27 +520,16 @@ def init_input_output(self): self.out = self.x * self.y def test_check_output(self): - self.check_output() + self.check_output(check_new_ir=False) def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - ) + self.check_grad(['X', 'Y'], 'Out', check_new_ir=False) def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set("X"), - ) + self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False) def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], - 'Out', - no_grad_set=set('Y'), - ) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False) class TestRealComplexElementwiseMulOp(TestComplexElementwiseMulOp): From 6d9d73a230d65c871da4487c30a5c82558056833 Mon Sep 17 00:00:00 2001 From: Ruibin Cheung Date: Thu, 21 Sep 2023 14:35:35 +0800 Subject: [PATCH 30/39] [Custom Device] change the dlopen flag of custom device dylibs (#57544) --- paddle/fluid/platform/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index eae360c146df5..a3fff528f7903 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -142,7 +142,7 @@ void LoadCustomDevice(const std::string &library_dir) { LOG(INFO) << "Try loading custom device libs from: [" << library_dir << "]"; std::vector libs = phi::ListAllLibraries(library_dir); for (const auto &lib_path : libs) { - auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); + auto dso_handle = dlopen(lib_path.c_str(), RTLD_LAZY); PADDLE_ENFORCE_NOT_NULL( dso_handle, platform::errors::InvalidArgument( From c5a70065ac0baa817903595749e5b5e425bccc1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:46:08 +0800 Subject: [PATCH 31/39] move ir_nodes_collector from namespace optim to ir_utils (#57535) --- paddle/cinn/ast_gen_ius/tensor_group.cc | 2 +- .../cinn/auto_schedule/analysis/analyze_ir.cc | 37 ++-- .../search_space/auto_gen_rule/auto_bind.cc | 4 +- .../search_space/auto_gen_rule/auto_inline.cc | 16 +- .../search_space/auto_gen_rule/auto_unroll.cc | 2 +- paddle/cinn/backends/codegen_cuda_dev.cc | 2 +- paddle/cinn/backends/llvm/codegen_x86.cc | 2 +- paddle/cinn/common/arithmatic.cc | 4 +- paddle/cinn/common/cas.cc | 2 +- paddle/cinn/common/ir_util.cc | 10 +- .../cinn/hlir/framework/op_lowering_util.cc | 7 +- paddle/cinn/hlir/pe/ir_schedule_pe.cc | 6 +- paddle/cinn/ir/ir.cc | 2 +- paddle/cinn/ir/lowered_func.cc | 14 +- paddle/cinn/ir/schedule/ir_schedule.cc | 61 ++++--- paddle/cinn/ir/schedule/ir_schedule_util.cc | 172 +++++++++--------- paddle/cinn/ir/tensor.cc | 6 +- paddle/cinn/ir/test/collect_ir_nodes_test.cc | 3 +- paddle/cinn/ir/utils/ir_nodes_collector.cc | 4 +- paddle/cinn/ir/utils/ir_nodes_collector.h | 4 +- paddle/cinn/lang/lower.cc | 56 +++--- paddle/cinn/lang/lower_impl.cc | 19 +- paddle/cinn/lang/lower_tensor_group.cc | 4 +- paddle/cinn/optim/buffer_assign.cc | 2 +- paddle/cinn/optim/compute_inline_expand.cc | 9 +- .../optim/eliminate_broadcast_in_forloop.cc | 8 +- paddle/cinn/optim/transform_gpu_forloop.cc | 2 +- paddle/cinn/optim/vectorize_loops.cc | 8 +- 
paddle/cinn/poly/domain.cc | 4 +- paddle/cinn/poly/stage.cc | 15 +- 30 files changed, 253 insertions(+), 234 deletions(-) diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc index 2b604f2c383cb..e8b9c6a345e72 100644 --- a/paddle/cinn/ast_gen_ius/tensor_group.cc +++ b/paddle/cinn/ast_gen_ius/tensor_group.cc @@ -30,7 +30,7 @@ TensorGroup::TensorGroup(const std::vector& tensors) { for (auto& tensor : tensors) { output_tensor_names_.insert(tensor->name); - std::set used_tensors = ir::CollectIRNodes( + std::set used_tensors = ir::ir_utils::CollectIRNodes( tensor->body(), [](const Expr* x) { return x->as_tensor(); }); for (const Expr& x : used_tensors) { const ir::Tensor to_dep = x.as_tensor_ref(); diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 17aad495b246a..da2c063d9c00d 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -54,29 +54,30 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) { return; } - ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) { - const ir::Load* load_expr = x->As(); - if (load_expr != nullptr) { - const ir::Tensor t = load_expr->tensor.as_tensor_ref(); - sche_block->read_buffers.emplace_back( - ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices))); - return false; - } - const ir::Store* store_expr = x->As(); - if (store_expr != nullptr) { - const ir::Tensor t = store_expr->tensor.as_tensor_ref(); - sche_block->write_buffers.emplace_back( - ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices))); - return false; - } - return false; - }); + ir::ir_utils::CollectIRNodesWithoutTensor( + sche_block->body, [&](const Expr* x) { + const ir::Load* load_expr = x->As(); + if (load_expr != nullptr) { + const ir::Tensor t = load_expr->tensor.as_tensor_ref(); + sche_block->read_buffers.emplace_back( + ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices))); + return false; + } + const ir::Store* store_expr = x->As(); + if (store_expr != nullptr) { + const ir::Tensor t = store_expr->tensor.as_tensor_ref(); + sche_block->write_buffers.emplace_back( + ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices))); + return false; + } + return false; + }); } bool ContainsNodeType(ir::Expr expr, const std::unordered_set& node_types) { std::set collection = - ir::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) { return node_types.find(x->node_type()) != node_types.end(); }); return !collection.empty(); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index 06215d98d8b27..62c92c9e38fca 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -31,7 +31,7 @@ bool IsSpatialLoop(const ir::For* for_node) { const auto& loop_var = for_node->loop_var; // collect cases where the loop_var used in one of reduce axis in underneath // ScheduleBlock - auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor( + auto used_for_reduce_axis = ir::ir_utils::CollectIRNodesWithoutTensor( for_node->body, [&loop_var](const Expr* x) { const auto* block_realize = x->As(); if (!block_realize) return false; @@ -46,7 +46,7 @@ bool IsSpatialLoop(const ir::For* for_node) { const ir::Expr& binding = 
block_realize->iter_values[i]; if (iter_var->is_reduce_axis || iter_var->name.substr(0, 6) == "reduce") { - auto used_exprs = ir::CollectIRNodesWithoutTensor( + auto used_exprs = ir::ir_utils::CollectIRNodesWithoutTensor( binding, [&loop_var](const Expr* x) { const ir::_Var_* var = x->As(); if (var && diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc index 946947611f35d..16eca6d677b89 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc @@ -49,7 +49,7 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr, ir::Expr root = ir_sch->GetRootBlock(sche_block_realize_expr); // Check the schedule block to be inlined is not a reduce tensor. - std::set find_store = ir::CollectIRNodesWithoutTensor( + std::set find_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }); if (find_store.size() != 1UL) { return false; @@ -76,17 +76,19 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr, } // Check this schedule block is the only writer of the tensor. - find_store = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && - (x->As()->tensor).as_tensor_ref()->name == tensor->name; - }); + find_store = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && + (x->As()->tensor).as_tensor_ref()->name == + tensor->name; + }); if (find_store.size() != 1UL) { return false; } // Check there is no overlap between the buffers the schedule block reads and // writes. - std::set find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + std::set find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor_expr; }); if (!find_load.empty()) { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index 946bd9e9d7730..000203306c1a1 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -56,7 +56,7 @@ bool AutoUnroll::MeetCondition(const ir::ScheduleBlock* schedule_block) const { return false; }; - auto find_target_exprs = ir::CollectIRNodesWithoutTensor( + auto find_target_exprs = ir::ir_utils::CollectIRNodesWithoutTensor( schedule_block->body, [&has_reduce_iter, &has_nonserial_loop](const Expr* x) { return has_reduce_iter(x) || has_nonserial_loop(x); diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index e33154f0c0129..1f6f5bba154aa 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -90,7 +90,7 @@ std::vector CodeGenCUDA_Dev::GenerateBufferAliasExprs( temp_buffers.end()); // prepare temp buffer alias std::vector buffer_alias; - auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) { + auto tensors = ir::ir_utils::CollectIRNodes(op->body, [&](const Expr *x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && temp_buffer_set.count(x->as_tensor()->buffer); }); diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc index ccae02ac5746b..9de0603e2c9e2 100644 --- a/paddle/cinn/backends/llvm/codegen_x86.cc +++ 
b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -98,7 +98,7 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { llvm::Function::PrivateLinkage, "__parallel_lambda", m_); - std::vector vars = ir::CollectUndefinedVars(&body); + std::vector vars = ir::ir_utils::CollectUndefinedVars(&body); uint64_t nbytes; auto* data = PackVars(vars, &nbytes); diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc index 16b1d9cb8e8a5..af6656317aa11 100644 --- a/paddle/cinn/common/arithmatic.cc +++ b/paddle/cinn/common/arithmatic.cc @@ -126,7 +126,7 @@ GiNaC::ex ExprToGinacConverter::BuildHelper(ir::Expr expr) { GiNaC::ex ExprToGinacConverter::operator()(Expr expr) { // TODO(Superjomn) Replace this with common::IsPureMath( - auto complex_nodes = CollectIRNodes(expr, [](const Expr* n) { + auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [](const Expr* n) { return n->As() || // n->As() || // n->As() || // @@ -262,7 +262,7 @@ bool IsPureMath(Expr expr) { IrNodeTy ::Minus, }); - auto complex_nodes = ir::CollectIRNodes(expr, [&](const Expr* n) { + auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [&](const Expr* n) { return !valid_node_tys.count(n->node_type()); }); #ifdef CINN_DEBUG diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index 6264c5b12d453..bf1c9092ed5eb 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -1868,7 +1868,7 @@ bool IsExprCasCompatible(Expr expr) { return expr->As() || expr->As() || expr->As() || expr->As
(); }; - return ir::CollectIRNodes(expr, teller).empty(); + return ir::ir_utils::CollectIRNodes(expr, teller).empty(); } // Partially divide a by b. e.g. (2x+y)/2 => x + y/2 diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc index f0f219ee105f7..4f000af1e8f0d 100644 --- a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -249,8 +249,8 @@ Expr or_all(const std::vector &conds) { } void CheckTensorUniqueInExpr(Expr expr) { - auto tensor_uniq = - ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); }); + auto tensor_uniq = ir::ir_utils::CollectIRNodes( + expr, [](const Expr *x) { return x->as_tensor(); }); absl::flat_hash_map tensor_names; for (auto &t : tensor_uniq) { auto *tp = t.as_tensor(); @@ -269,9 +269,9 @@ void CheckBufferUniqueInExpr(Expr expr) { // the buffers exists in tensor and lowered functions. CheckTensorUniqueInExpr(expr); - auto tensors = - ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); }); - auto funcs = ir::CollectIRNodes( + auto tensors = ir::ir_utils::CollectIRNodes( + expr, [](const Expr *x) { return x->as_tensor(); }); + auto funcs = ir::ir_utils::CollectIRNodes( expr, [](const Expr *x) { return x->as_lowered_func(); }); absl::flat_hash_map buffer_name; diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index e7a4412202d87..1af9ef0576351 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -1046,7 +1046,7 @@ void LoopAssignReduce( auto first_reduce_loop = rloops.front(); // collect if auto if_checker = [](const Expr* x) { return x->As(); }; - auto if_set = ir::CollectIRNodesWithoutTensor( + auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor( first_reduce_loop.As()->body, if_checker); std::string reduce_block_name = reducer_data->id(); for (auto if_expr : if_set) { @@ -1056,10 +1056,11 @@ void LoopAssignReduce( ->schedule_block.As() ->name == reduce_block_name; }; - auto blocks_in_if = ir::CollectIRNodesWithoutTensor(if_expr, checker); + auto blocks_in_if = + ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker); if (!blocks_in_if.empty()) { ir::Expr condition = if_expr.As()->condition; - auto indices_in_if = ir::CollectIRNodesWithoutTensor( + auto indices_in_if = ir::ir_utils::CollectIRNodesWithoutTensor( condition, [](const Expr* x) { return x->As(); }); for (int i = 0; i < rloops.size(); ++i) { std::string var_name = rloops[i].As()->loop_var->name; diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc index 3677025aaedaa..6600905b083c1 100644 --- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc +++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc @@ -633,7 +633,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT // simplify reshape index auto hand_write_simplify = [](std::vector loops, ir::Expr block) { // check exist select. 
- auto find_select = ir::CollectIRNodesInOrder( + auto find_select = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); if (find_select.size() > 0) { return; @@ -667,7 +667,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT index = index + ir::Expr(schedule_block->iter_vars[idx]) * stride; } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); CHECK_EQ(exprs.size(), 1); auto load = exprs.front().As(); @@ -709,7 +709,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT break; } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( block, [&](const Expr *x) { return x->As(); }); for (auto expr : exprs) { auto load = expr.As(); diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 5427a14afa5ba..7911f3ea14bba 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -535,7 +535,7 @@ std::vector PolyFor::expr_fields() const { } Expr PolyFor::ExtractExtent() const { - auto nodes = CollectIRNodes(condition, [&](const Expr *e) { + auto nodes = ir::ir_utils::CollectIRNodes(condition, [&](const Expr *e) { return e->As() || // e->As() || // e->As() || // diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc index 5a897e7c334a5..ec5f4b2e64ce6 100644 --- a/paddle/cinn/ir/lowered_func.cc +++ b/paddle/cinn/ir/lowered_func.cc @@ -82,7 +82,7 @@ std::vector _LoweredFunc_::expr_fields() const { return {&body}; } void _LoweredFunc_::PrepareCudaAxisInfoFromBody() { std::set bound_for_exprs = - ir::CollectIRNodes(body, [](const Expr* expr) { + ir::ir_utils::CollectIRNodes(body, [](const Expr* expr) { const ir::For* for_expr = expr->As(); return for_expr != nullptr && for_expr->is_binded(); }); @@ -208,7 +208,7 @@ void _LoweredFunc_::AllocTempBuffer() {} void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) { buffer_data_cast_exprs.clear(); // collect write. - auto write_teller = ir::CollectTensorNeedsWrite(&body); + auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body); auto tensors = CollectAllTensorReference(with_expr_gen_tensor); std::sort(tensors.begin(), @@ -248,7 +248,7 @@ std::vector _LoweredFunc_::CudaAliasVarExprs() const { } // collect write. std::vector res; - auto write_teller = ir::CollectTensorNeedsWrite(&body); + auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body); auto tensors = CollectAllTensorReference(); std::sort(tensors.begin(), @@ -403,11 +403,11 @@ std::vector _LoweredFunc_::CollectAllTensorReference( bool with_expr_gen_tensor) const { std::set tensor_exprs = with_expr_gen_tensor - ? ir::CollectIRNodes( + ? ir::ir_utils::CollectIRNodes( body, [](const Expr* expr) { return expr->As(); }) - : ir::CollectIRNodesWithoutTensor(body, [](const Expr* expr) { - return expr->As(); - }); + : ir::ir_utils::CollectIRNodesWithoutTensor( + body, + [](const Expr* expr) { return expr->As(); }); std::vector tensors; // remove the duplicate tensor by their name. 
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index 78ce98564dbdc..fab8a53deb121 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -767,7 +767,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> { rewriter(&info->cache_block); rewriter.mutate_cache_block = false; rewriter(&new_root); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( new_root, [&](const Expr* x) { return x->As() && @@ -775,7 +775,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> { }, true); if (!find_tensor.empty()) { - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( (*find_tensor.begin()), [&](const Expr* x) { return x->As() && (x->As()->tensor == Expr(info->write_tensor)); @@ -864,7 +864,7 @@ struct ChangeBodyToBlock : public ir::IRMutator<> { DeviceAPI ScheduleImpl::GetDeviceAPI() const { auto exprs = this->GetModule().GetExprs(); - auto find_for_nodes = ir::CollectIRNodesWithoutTensor( + auto find_for_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( exprs.front(), [&](const Expr* x) { return x->As(); }, true); CHECK(!find_for_nodes.empty()); return (*find_for_nodes.begin()).As()->device_api; @@ -925,7 +925,7 @@ Expr ScheduleImpl::CacheWrite(const Expr& block, ->schedule_block.As() ->body); - auto find_cache_block = ir::CollectIRNodesWithoutTensor( + auto find_cache_block = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -937,9 +937,10 @@ Expr ScheduleImpl::CacheWrite(const Expr& block, CHECK(info.write_tensor->buffer.defined()); // Replace buffer - auto all_tensors = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto all_tensors = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); for (auto i : all_tensors) { if (i.as_tensor()->name != info.write_tensor->name && @@ -1119,7 +1120,7 @@ Expr ScheduleImpl::Reorder(const Expr& block, Expr ScheduleImpl::GetRootBlock(const Expr& expr) const { auto exprs = this->GetModule().GetExprs(); for (auto& it_expr : exprs) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( it_expr, [&](const Expr* x) { return x->node_type() == expr.node_type() && *x == expr; @@ -1198,20 +1199,21 @@ struct LoopReconstructor : public ir::IRMutator<> { // Replace the copied Tensor object with the original Tensor object, // to ensure that the same Tensor in a AST is the same object. 
std::unordered_map tensors_map; - ir::CollectIRNodesWithoutTensor(loop_, [&tensors_map](const Expr* x) { - if (x->as_tensor()) { - tensors_map.insert({x->as_tensor()->name, *x}); - return true; - } - return false; - }); - auto find_store = ir::CollectIRNodesWithoutTensor( + ir::ir_utils::CollectIRNodesWithoutTensor( + loop_, [&tensors_map](const Expr* x) { + if (x->as_tensor()) { + tensors_map.insert({x->as_tensor()->name, *x}); + return true; + } + return false; + }); + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop_, [](const Expr* x) { return x->As(); }); for (auto store : find_store) { store.As()->tensor = tensors_map.at(store.As()->tensor.as_tensor()->name); } - auto find_load = ir::CollectIRNodesWithoutTensor( + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop_, [](const Expr* x) { return x->As(); }); for (auto load : find_load) { load.As()->tensor = @@ -1275,7 +1277,7 @@ void ScheduleImpl::SetBuffer(Expr& block, const std::string& memory_type, bool fixed) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; @@ -1286,7 +1288,7 @@ void ScheduleImpl::SetBuffer(Expr& block, auto exprs = this->GetModule().GetExprs(); for (auto& it_expr : exprs) { auto find_tensor = - ir::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) { return x->as_tensor() && (x->as_tensor()->name == tensor.as_tensor_ref()->name || x->as_tensor()->name == @@ -1328,7 +1330,7 @@ void ScheduleImpl::MergeExprs() { ->body); VLOG(3) << "Before merging, exprs[0] is : " << exprs[0]; for (int i = 1; i < exprs.size(); ++i) { - auto root_block = ir::CollectIRNodesWithoutTensor( + auto root_block = ir::ir_utils::CollectIRNodesWithoutTensor( exprs[i], [&](const Expr* x) { return x->As() && @@ -1437,7 +1439,7 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { auto body = block_loops.at(loops.size() - 1).As()->body; // collect if auto if_checker = [](const Expr* x) { return x->As(); }; - auto if_set = ir::CollectIRNodesWithoutTensor(body, if_checker); + auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor(body, if_checker); for (auto if_expr : if_set) { auto checker = [block_name](const Expr* x) { return x->As() && @@ -1445,7 +1447,8 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { ->schedule_block.As() ->name == block_name; }; - if (ir::CollectIRNodesWithoutTensor(if_expr, checker, true).size() > 0) { + if (ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker, true) + .size() > 0) { result = IfThenElse::Make(if_expr.As()->condition, result); break; @@ -1582,7 +1585,7 @@ bool ComputeInliner::BodyPatternAllowInline() { return false; } CHECK(inlined_store_.As()); - auto find_vars = ir::CollectIRNodesWithoutTensor( + auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor( inlined_store_, [&](const Expr* x) { return x->as_var(); }); std::set vars_set; for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); @@ -1650,7 +1653,7 @@ bool ReverseComputeInliner::BodyPatternAllowInline() { CHECK(inlined_store_.As()); CHECK(inlined_load_.As()); CHECK(target_store_.As()); - auto find_vars = ir::CollectIRNodesWithoutTensor( + auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor( inlined_store_, [&](const Expr* 
x) { return x->as_var(); }); std::set vars_set; for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); @@ -2036,7 +2039,7 @@ void ScheduleImpl::FlattenLoops(const std::vector& loops, } } - auto exprs = ir::CollectIRNodesInOrder( + auto exprs = ir::ir_utils::CollectIRNodesInOrder( schedule_block->body, [&](const Expr* x) { return x->As() || x->As(); }); // reverse exprs from last to first. @@ -2185,7 +2188,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, std::set used_target_loop_vars; for (auto& iter_val : new_iter_values) { auto find_partial_loop = - ir::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) { if (x->as_var()) used_target_loop_vars.insert(x->as_var_ref()->name); return x->as_var(); }); @@ -2194,7 +2197,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, std::vector used_target_loops; auto expr_copy = optim::IRCopy(expr); for (auto& var : used_target_loop_vars) { - auto find_loop_var = ir::CollectIRNodesWithoutTensor( + auto find_loop_var = ir::ir_utils::CollectIRNodesWithoutTensor( expr_copy, [&](const Expr* x) { return x->As() && x->As()->loop_var->name == var && @@ -2222,7 +2225,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, } else { CHECK(old_iter_values[changed_loop_num].as_var()); auto old_var = old_iter_values[changed_loop_num].as_var_ref(); - auto find_partial_loop = ir::CollectIRNodesWithoutTensor( + auto find_partial_loop = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [&](const Expr* x) { return x->As() && @@ -2232,7 +2235,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, true); CHECK_EQ(find_partial_loop.size(), 1U); new_loop = optim::IRCopy(*find_partial_loop.begin()); - auto find_schedule_block = ir::CollectIRNodesWithoutTensor( + auto find_schedule_block = ir::ir_utils::CollectIRNodesWithoutTensor( new_loop, [&](const Expr* x) { return x->As(); }, true); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index b4000ff212cad..45779788e9c54 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -40,7 +40,7 @@ namespace ir { Tensor GetTensor(const Expr& block) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; @@ -52,13 +52,13 @@ Tensor GetTensor(const Expr& block) { Tensor GetReadTensor(const Expr& block, int index) { CHECK(block.As()); - auto find_tensor = ir::CollectIRNodesWithoutTensor( + auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; std::vector res; auto find_read_tensor = - ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { if (x->As()) res.push_back(x->As()->tensor.as_tensor_ref()); return x->As(); @@ -86,41 +86,43 @@ void SetCudaAxisInfo(Expr* lowered_func) { auto func_body = lowered_func->as_lowered_func_ref()->body; CudaAxisInfo info; - auto block_nodes = ir::CollectIRNodes(func_body, [&](const Expr* x) { - if (x->As() && x->As()->bind_info().valid()) { - auto bind_info = x->As()->bind_info(); - 
info.set_valid(true); - if (bind_info.for_type == ForType::GPUThread) { - CHECK(common::is_zero(x->As()->min)); - CHECK(x->As()->extent.is_constant()); - int range = x->As()->extent.get_constant(); - range = range > info.block_dim(bind_info.offset) - ? range - : info.block_dim(bind_info.offset); - VLOG(3) << "Set block dim[" << bind_info.offset << "] with range " - << range; - info.set_block_dim(bind_info.offset, range); - } else if (bind_info.for_type == ForType::GPUBlock) { - CHECK(common::is_zero(x->As()->min)); - CHECK(x->As()->extent.is_constant()); - int range = x->As()->extent.get_constant(); - range = range > info.grid_dim(bind_info.offset) - ? range - : info.grid_dim(bind_info.offset); - info.set_grid_dim(bind_info.offset, range); - VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range " - << range; - } else { - LOG(FATAL) << "The for loop's bind info should be gpu block or thread!"; - } - } - return (x->As() && x->As()->bind_info().valid()); - }); + auto block_nodes = + ir::ir_utils::CollectIRNodes(func_body, [&](const Expr* x) { + if (x->As() && x->As()->bind_info().valid()) { + auto bind_info = x->As()->bind_info(); + info.set_valid(true); + if (bind_info.for_type == ForType::GPUThread) { + CHECK(common::is_zero(x->As()->min)); + CHECK(x->As()->extent.is_constant()); + int range = x->As()->extent.get_constant(); + range = range > info.block_dim(bind_info.offset) + ? range + : info.block_dim(bind_info.offset); + VLOG(3) << "Set block dim[" << bind_info.offset << "] with range " + << range; + info.set_block_dim(bind_info.offset, range); + } else if (bind_info.for_type == ForType::GPUBlock) { + CHECK(common::is_zero(x->As()->min)); + CHECK(x->As()->extent.is_constant()); + int range = x->As()->extent.get_constant(); + range = range > info.grid_dim(bind_info.offset) + ? range + : info.grid_dim(bind_info.offset); + info.set_grid_dim(bind_info.offset, range); + VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range " + << range; + } else { + LOG(FATAL) + << "The for loop's bind info should be gpu block or thread!"; + } + } + return (x->As() && x->As()->bind_info().valid()); + }); lowered_func->as_lowered_func_ref()->cuda_axis_info = info; } bool Contains(const Expr& container, const Expr& expr) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( container, [&](const Expr* x) { return (x->node_type() == expr.node_type() && *x == expr); @@ -283,13 +285,13 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) { auto* rf_for = rf_loop.As(); CHECK(rf_for) << "Expr param of Rfactor must be For node! 
Please check."; // check the rf_loop only has one schedule block - auto block_nodes = ir::CollectIRNodesWithoutTensor( + auto block_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( rf_loop, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(block_nodes.size(), 1U) << "Rfactor Loop should only have one schedule block"; - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( rf_loop, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_store.size(), 1U); auto indice = find_store.begin()->As()->indices; @@ -322,9 +324,9 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) { } std::vector GetLoopsOfExpr(const Expr& expr, const Expr& root) { - auto loop_nodes = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && Contains(*x, expr); - }); + auto loop_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + root, + [&](const Expr* x) { return x->As() && Contains(*x, expr); }); std::vector result(loop_nodes.begin(), loop_nodes.end()); if (result.empty()) LOG(FATAL) << "Didn't find expr's : \n" @@ -439,8 +441,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) { ->body; if (is_write) { std::vector find_store_vec; - auto find_store = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { if (x->As()) find_store_vec.push_back(*x); return x->As(); }); @@ -450,8 +452,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) { return store_index; } else { std::vector find_load_vec; - auto find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { if (x->As()) find_load_vec.push_back(*x); return x->As(); }); @@ -526,7 +528,7 @@ void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write) { Expr find_tensor = is_write ? Expr(info->write_tensor) : Expr(info->read_tensor); auto find_produce_read = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->tensor == find_tensor; }); @@ -675,9 +677,9 @@ Expr ConstructNewLoopChain(const std::vector& chain, // In each IfThenElse node, find the vars its condition depends on. 
for (auto& if_expr : if_nodes) { CHECK(if_expr.As()); - auto var_set = - ir::CollectIRNodes(if_expr.As()->condition, - [&](const Expr* x) { return x->as_var(); }); + auto var_set = ir::ir_utils::CollectIRNodes( + if_expr.As()->condition, + [&](const Expr* x) { return x->as_var(); }); std::set var_name_set; for (auto& i : var_set) var_name_set.insert(i.as_var()->name); condition_vars.push_back(var_name_set); @@ -863,7 +865,7 @@ std::vector GetProducers(const Expr& block, const Expr& root) { std::string block_name = block.As() ->schedule_block.As() ->name; - ir::CollectIRNodesWithoutTensor( + ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&producer_tensor_names, &block_name](const Expr* x) { auto* load = x->As(); if (load) { @@ -879,15 +881,15 @@ std::vector GetProducers(const Expr& block, const Expr& root) { // traverse each of other blocks and filter those ones which contain at least // one producer tensor; - auto find_blocks = - ir::CollectIRNodesWithoutTensor(root, [&block, &root](const Expr* x) { + auto find_blocks = ir::ir_utils::CollectIRNodesWithoutTensor( + root, [&block, &root](const Expr* x) { return x->As() && *x != block && *x != root; }); for (auto&& cur : find_blocks) { auto* cur_block = cur.As() ->schedule_block.As(); CHECK(cur_block) << "block result should be a ScheduleBlockRealize"; - auto find_stores = ir::CollectIRNodesWithoutTensor( + auto find_stores = ir::ir_utils::CollectIRNodesWithoutTensor( cur_block->body, [&producer_tensor_names](const Expr* x) { return x->As() && producer_tensor_names.count( @@ -905,27 +907,29 @@ std::vector GetConsumers(const Expr& block, const Expr& root) { std::string block_tensor = GetTensor(block)->name; if (IsReduceInitTensorName(block_tensor)) { std::string consumer_name = GetOriginalReduceTensorName(block_tensor); - auto consumer = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && - x->As() - ->schedule_block.As() - ->name == consumer_name; - }); + auto consumer = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && + x->As() + ->schedule_block.As() + ->name == consumer_name; + }); CHECK_EQ(consumer.size(), 1); return {*consumer.begin()}; } - auto find_block = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { - return x->As() && *x != block && *x != root; - }); + auto find_block = + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && *x != block && *x != root; + }); for (auto& i : find_block) { CHECK(i.As() ->schedule_block.As()); auto block_body = i.As() ->schedule_block.As() ->body; - auto find_load = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == block_tensor; @@ -938,7 +942,7 @@ std::vector GetConsumers(const Expr& block, const Expr& root) { void CheckComputeAtValidation(const Expr& block, const Expr& loop, const Expr& root) { - auto find_block = ir::CollectIRNodesWithoutTensor( + auto find_block = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && *x == block; @@ -946,13 +950,13 @@ void CheckComputeAtValidation(const Expr& block, true); CHECK(!find_block.empty()) << "Didn't find block in root!"; - auto find_loop = ir::CollectIRNodesWithoutTensor( + auto find_loop = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && *x == loop; }, true); CHECK(!find_loop.empty()) << 
"Didn't find loop in root!"; - auto find_block_in_loop = ir::CollectIRNodesWithoutTensor( + auto find_block_in_loop = ir::ir_utils::CollectIRNodesWithoutTensor( loop, [&](const Expr* x) { return x->As() && *x == block; @@ -1005,10 +1009,10 @@ std::vector CalculateRequiredRegions( std::set provided_nodes; if (is_store_provided) { - provided_nodes = ir::CollectIRNodesWithoutTensor( + provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }); } else { - provided_nodes = ir::CollectIRNodesWithoutTensor( + provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( block, [&](const Expr* x) { return x->As(); }); } @@ -1036,7 +1040,7 @@ std::vector CalculateRequiredRegions( // Notice that we look for For nodes in loop's body instead of loop // itself. - auto find_loops = ir::CollectIRNodesWithoutTensor( + auto find_loops = ir::ir_utils::CollectIRNodesWithoutTensor( loop.As()->body, [&](const Expr* x) { return x->As() && Contains(*x, req_block); }); @@ -1052,15 +1056,15 @@ std::vector CalculateRequiredRegions( std::set required_nodes; if (is_store_provided) { - required_nodes = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; }); } else { - required_nodes = - ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( + block_body, [&](const Expr* x) { return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; @@ -1105,7 +1109,7 @@ std::vector CalculateRequiredRegions( block.As()->iter_values[i].is_constant()); if (block.As()->iter_values[i].as_var()) { auto find_for_loops = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->loop_var->name == block.As() @@ -1134,13 +1138,13 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, ->schedule_block.As() ->body; // 1. Check the schedule block to be inlined is not a reduce tensor. - auto find_store = ir::CollectIRNodesWithoutTensor( + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_store.size(), 1U); Expr tensor = (*find_store.begin()).As()->tensor; CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); // 2. Check this schedule block is the only writer of the tensor. - find_store = ir::CollectIRNodesWithoutTensor( + find_store = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -1151,8 +1155,8 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, CHECK_EQ(find_store.size(), 1U); // 3. Check there is no overlap between the buffers the schedule block reads // and writes. - auto find_load = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK(find_load.empty()); @@ -1166,14 +1170,14 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( ->schedule_block.As() ->body; // 1. Check the schedule block to be reverse inlined is not a reduce tensor. 
- auto find_inlined_load = ir::CollectIRNodesWithoutTensor( + auto find_inlined_load = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_inlined_load.size(), 1U); Expr tensor = (*find_inlined_load.begin()).As()->tensor; CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); auto inlined_load = *find_inlined_load.begin(); // 2. Check this schedule block is the only reader of the tensor. - auto find_load = ir::CollectIRNodesWithoutTensor( + auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor( root, [&](const Expr* x) { return x->As() && @@ -1184,20 +1188,20 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( CHECK_EQ(find_load.size(), 1U); // 3. Check there is no overlap between the buffers the schedule block reads // and writes. - auto find_store = - ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { + auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK(find_store.empty()); // 4. Get store that will be inlined. auto find_inlined_store = - ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); CHECK_EQ(find_inlined_store.size(), 1U); auto inlined_store = *find_inlined_store.begin(); // 5. Get target store. - auto find_target_store = ir::CollectIRNodesWithoutTensor( + auto find_target_store = ir::ir_utils::CollectIRNodesWithoutTensor( compute_body, [&](const Expr* x) { return x->As(); }, true); CHECK_EQ(find_target_store.size(), 1U); auto target_store = *find_target_store.begin(); @@ -1206,7 +1210,7 @@ std::tuple CheckReverseComputeInlineValidationAndGetExprs( bool ContainVar(const std::vector& exprs, const std::string& var_name) { for (auto& expr : exprs) { - auto find_expr = ir::CollectIRNodesWithoutTensor( + auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [&](const Expr* x) { return x->As<_Var_>() && x->As<_Var_>()->name == var_name; diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 3297b714630e1..8ad8b9878d4bc 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -60,7 +60,7 @@ std::set _Tensor_::GetDependTensorNames() const { std::set names; auto add_depend_tensors_from_expr = [&](Expr expr) { - auto tensors = CollectIRNodes(expr, [&](const Expr *x) { + auto tensors = ir::ir_utils::CollectIRNodes(expr, [&](const Expr *x) { return x->as_tensor() && x->as_tensor()->name != this->name; }); for (auto &e : tensors) { @@ -515,7 +515,7 @@ bool _Tensor_::IsDependOnStatement(absl::string_view statement) { std::set _Tensor_::DependingTensorNames() { std::set res; if (body().defined()) { - auto depend_tensors = ir::CollectIRNodes( + auto depend_tensors = ir::ir_utils::CollectIRNodes( body(), [](const Expr *x) -> bool { return x->as_tensor(); }); for (const auto &x : depend_tensors) { if (x.get() != this) { @@ -538,7 +538,7 @@ std::vector _Tensor_::axis_with_reduce() const { } bool _Tensor_::Uses(const Tensor &other) const { - auto loads = ir::CollectIRNodes(body(), [&](const Expr *x) { + auto loads = ir::ir_utils::CollectIRNodes(body(), [&](const Expr *x) { auto *loadn = x->As(); if (!loadn) return false; return loadn->tensor.as_tensor()->name == other->name; diff --git a/paddle/cinn/ir/test/collect_ir_nodes_test.cc b/paddle/cinn/ir/test/collect_ir_nodes_test.cc index 82441b4a005c7..d380b4475e37d 100644 --- 
a/paddle/cinn/ir/test/collect_ir_nodes_test.cc +++ b/paddle/cinn/ir/test/collect_ir_nodes_test.cc @@ -19,6 +19,7 @@ namespace cinn { namespace ir { +namespace ir_utils { TEST(CollectIRNodes, basic0) { Expr C = Expr(1) + 2; @@ -57,6 +58,6 @@ TEST(CollectIRNodes, basic) { CollectIRNodes(fn_body, [](const Expr* x) { return x->as_tensor(); }); auto exprs = CollectIRNodes(fn_body, [](const Expr* x) { return x; }); } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index d44c3701b5ac2..7d7373a6b9ee8 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -21,8 +21,8 @@ namespace cinn { namespace ir { +namespace ir_utils { namespace { - struct IrNodesCollector : public IRVisitorRequireReImpl { using teller_t = std::function; using handler_t = std::function; @@ -317,6 +317,6 @@ std::set CollectTensorNeedsWrite(const Expr* e) { collector.Visit(e); return tensor_written; } - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.h b/paddle/cinn/ir/utils/ir_nodes_collector.h index 0f8a390e1ade7..7bfb1b3b4e6b3 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.h +++ b/paddle/cinn/ir/utils/ir_nodes_collector.h @@ -18,7 +18,7 @@ namespace cinn { namespace ir { - +namespace ir_utils { /** * Collect the IR Nodes(without duplication) in the expression. */ @@ -83,6 +83,6 @@ std::vector CollectUndefinedVars(const Expr* e); * Collect the Tensor Nodes which will be Writed by Store or Call Nodes */ std::set CollectTensorNeedsWrite(const Expr* e); - +} // namespace ir_utils } // namespace ir } // namespace cinn diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index 58ae00fe8771e..0b91b6d598ac7 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -40,7 +40,7 @@ std::vector GetArgs( std::vector res; std::map> name2loads; std::map> name2stores; - auto load_or_store_nodes = ir::CollectIRNodesWithoutTensor( + auto load_or_store_nodes = ir::ir_utils::CollectIRNodesWithoutTensor( func_body, [&](const Expr* x) { return x->As() || x->As(); }); @@ -102,7 +102,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, name_to_buffer; // used to avoid duplication. auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!tensor_group.Contain(x->as_tensor()->name) && ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && @@ -145,7 +145,7 @@ std::vector GetTempBuffers(const std::vector& tensor_args, name_to_buffer; // used to avoid duplication. 
auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!stage_map->Lookup(x->as_tensor()->name) || !stage_map[x->as_tensor()]->inlined()) && @@ -165,17 +165,18 @@ std::vector GetTempBuffers(const std::vector& tensor_args, } } // visit the ir body and update the map of name_to_buffer - auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { - if (x->as_tensor() && x->as_tensor()->buffer.defined()) { - auto buffer_name = x->as_tensor()->buffer->name; - if (name_to_buffer.count(buffer_name) && - x->as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { - name_to_buffer[buffer_name] = x->as_tensor()->buffer; - } - } - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto update_map = + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && + x->as_tensor()->buffer->numel() < + name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); std::vector temp_buffers; for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); @@ -195,7 +196,7 @@ std::vector GetTempBuffers(const std::vector& args, name_to_buffer; // used to avoid duplication. auto all_temp_tensors = - ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined() && (!buffer_arg_names.count(x->as_tensor()->buffer->name) || utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer")); @@ -212,17 +213,18 @@ std::vector GetTempBuffers(const std::vector& args, } } // visit the ir body and update the map of name_to_buffer - auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { - if (x->as_tensor() && x->as_tensor()->buffer.defined()) { - auto buffer_name = x->as_tensor()->buffer->name; - if (name_to_buffer.count(buffer_name) && - x->as_tensor()->buffer->numel() < - name_to_buffer[buffer_name]->numel()) { - name_to_buffer[buffer_name] = x->as_tensor()->buffer; - } - } - return x->as_tensor() && x->as_tensor()->buffer.defined(); - }); + auto update_map = + ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && + x->as_tensor()->buffer->numel() < + name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); std::vector temp_buffers; for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); @@ -250,7 +252,7 @@ void InitReduceTensor(StageMap stages, tensor->InitReduction(stages, target); } auto uninited_reduce_tensors = - ir::CollectIRNodes(tensor->body(), [&](const Expr* x) { + ir::ir_utils::CollectIRNodes(tensor->body(), [&](const Expr* x) { return x && x->defined() && x->as_tensor() && x->as_tensor()->is_reduce_tensor() && !x->as_tensor()->IsReduceInited(stages); diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc index 629b405dcd2f0..24d5325bc1be9 100644 --- a/paddle/cinn/lang/lower_impl.cc +++ 
b/paddle/cinn/lang/lower_impl.cc @@ -35,7 +35,7 @@ namespace lang { namespace detail { void CheckNoIslCallRemains(Expr* expr) { - auto isl_calls = ir::CollectIRNodes(*expr, [](const Expr* expr) { + auto isl_calls = ir::ir_utils::CollectIRNodes(*expr, [](const Expr* expr) { return expr->As() && expr->As()->is_isl_call(); }); #ifdef CINN_DEBUG @@ -223,7 +223,7 @@ void CreateCompGraphWithInlineTensors(common::Graph* graph, // collect dependency tensors of t // here we just collect the tensors in Load nodes // NOTE there may be some other cases. - auto deps = ir::CollectLoadTensors( + auto deps = ir::ir_utils::CollectLoadTensors( t->body(), [](const Expr* x) { return x->as_tensor(); }); for (const auto& dep : deps) { auto e_tensor = dep.as_tensor_ref(); @@ -342,7 +342,7 @@ std::vector LowerImpl::GenerateFunctionArgumentList( CheckArgsUnique(); std::vector args; - auto teller = ir::CollectTensorNeedsWrite(&fn_body); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body); std::set arg_names; @@ -395,7 +395,7 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( std::vector in_args; std::vector out_args; - auto teller = ir::CollectTensorNeedsWrite(&func_iterator); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&func_iterator); std::set arg_names; std::set all_tensor_names; @@ -408,11 +408,12 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( in_args.emplace_back(scalar, ir::Argument::IO::kInput); } - auto all_tensors = ir::CollectIRNodes(func_iterator, [&](const Expr* x) { - return x->as_tensor() && !stages_[x->as_tensor()]->inlined(); - }); + auto all_tensors = + ir::ir_utils::CollectIRNodes(func_iterator, [&](const Expr* x) { + return x->as_tensor() && !stages_[x->as_tensor()]->inlined(); + }); - auto all_vars = ir::CollectIRNodes( + auto all_vars = ir::ir_utils::CollectIRNodes( func_iterator, [&](const Expr* x) { return x->as_var(); }); for (auto& i : all_tensors) { @@ -588,7 +589,7 @@ std::vector LowerImpl::operator()() { Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer; } } - auto store_exprs = ir::CollectIRNodes( + auto store_exprs = ir::ir_utils::CollectIRNodes( func_iterator, [](const Expr* x) { return x->As(); }); std::vector new_temp_tensors; for (auto& expr : store_exprs) { diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc index 200b608387560..0a802c0f0566d 100644 --- a/paddle/cinn/lang/lower_tensor_group.cc +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -88,7 +88,7 @@ std::vector LowerTensorGroup::operator()() { } // Some store tensors are also temp tensors; - auto store_exprs = ir::CollectIRNodes( + auto store_exprs = ir::ir_utils::CollectIRNodes( func_body, [](const Expr* x) { return x->As(); }); for (auto& expr : store_exprs) { auto* store_node = expr.As(); @@ -146,7 +146,7 @@ std::vector LowerTensorGroup::operator()() { std::vector LowerTensorGroup::GenerateFunctionArgumentList( Expr fn_body) { std::vector args; - auto teller = ir::CollectTensorNeedsWrite(&fn_body); + auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body); std::set arg_names; diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc index eb059a30ea26d..175689defbe36 100644 --- a/paddle/cinn/optim/buffer_assign.cc +++ b/paddle/cinn/optim/buffer_assign.cc @@ -73,7 +73,7 @@ std::map InitialAssignBuffer( // unify all the tensor occurance with a global one, e.g. there are multiple // tensor B exists in the expression, replace them with a shared one. 
- ir::CollectIRNodes(*expr, [&](const Expr* x) -> bool { + ir::ir_utils::CollectIRNodes(*expr, [&](const Expr* x) -> bool { auto* t = x->as_tensor(); if (t && !stages[t]->inlined()) { Reference(x) = Expr(all_tensor_map.at(t->name)); diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc index 8dad52ab4d9bc..d4123729bc53f 100644 --- a/paddle/cinn/optim/compute_inline_expand.cc +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -225,7 +225,7 @@ void ComputeInlineExpand(Expr *expr, poly::StageMap stages, std::map *all_tensor_map) { // the inline tensors contained in the expression. - auto inline_tensors = ir::CollectIRNodes(*expr, [&](const Expr *x) { + auto inline_tensors = ir::ir_utils::CollectIRNodes(*expr, [&](const Expr *x) { return x->as_tensor() && stages[x->as_tensor()]->inlined(); }); @@ -240,9 +240,10 @@ void ComputeInlineExpand(Expr *expr, TensorInlineExpandMutator(tensor->name, all_tensor_map, stages)(expr); } - inline_tensors = ir::CollectLoadTensors(*expr, [&](const Expr *x) { - return x->as_tensor() && stages[x->as_tensor()]->inlined(); - }); + inline_tensors = + ir::ir_utils::CollectLoadTensors(*expr, [&](const Expr *x) { + return x->as_tensor() && stages[x->as_tensor()]->inlined(); + }); } } diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc index a4feec97626cb..bb546f694be9d 100644 --- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc +++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc @@ -36,9 +36,9 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { auto* node = expr->As(); - auto broadcasts = ir::CollectIRNodes(node->value, [&](const Expr* expr) { - return expr->As(); - }); + auto broadcasts = ir::ir_utils::CollectIRNodes( + node->value, + [&](const Expr* expr) { return expr->As(); }); std::vector let_exprs; Var tmp; @@ -79,7 +79,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { } bool ContainsLoopVar(Expr expr, Var loop_var) { - return !ir::CollectIRNodes(expr, [&](const Expr* e) -> bool { + return !ir::ir_utils::CollectIRNodes(expr, [&](const Expr* e) -> bool { return e->As() && e->As()->name == loop_var->name; }).empty(); diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index d12a5c9f2dab8..7b30f75bf9652 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -586,7 +586,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> { int BufferSize(ir::Expr indice) { auto copy = IRCopy(indice); - auto vars = ir::CollectIRNodesInOrder( + auto vars = ir::ir_utils::CollectIRNodesInOrder( copy, [](const ir::Expr *expr) { return expr->As(); }); int max_range = 1; diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 2f3a9b29a3567..357bafe79730a 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -129,7 +129,8 @@ class TensorVectorizeTeller : public ir::IRMutator { // the iter val must appear in the last index if (indices.empty() || - ir::CollectIRNodes(indices.back(), find_matched_var_fn).empty()) { + ir::ir_utils::CollectIRNodes(indices.back(), find_matched_var_fn) + .empty()) { VLOG(5) << "Loop var:" << iter_var_->name << " is not used in the last index"; return false; @@ -137,7 +138,8 @@ class TensorVectorizeTeller : public ir::IRMutator { // the iter val can't appear in mulitple indices for (int i = 0; i < indices.size() - 1; ++i) { - 
auto repeat_found = ir::CollectIRNodes(indices[i], find_matched_var_fn); + auto repeat_found = + ir::ir_utils::CollectIRNodes(indices[i], find_matched_var_fn); if (!repeat_found.empty()) { VLOG(5) << "Loop var:" << iter_var_->name << " is used at more than last index, current:" << i; @@ -214,7 +216,7 @@ class CudaVectorizer : public IRMutator { } void Visit(Expr *expr) { - write_teller_ = ir::CollectTensorNeedsWrite(expr); + write_teller_ = ir::ir_utils::CollectTensorNeedsWrite(expr); vectorized_teller_.Collect(expr); IRMutator::Visit(expr, expr); } diff --git a/paddle/cinn/poly/domain.cc b/paddle/cinn/poly/domain.cc index 309fa5aaa3db4..257de52fe7a5b 100644 --- a/paddle/cinn/poly/domain.cc +++ b/paddle/cinn/poly/domain.cc @@ -70,8 +70,8 @@ void Domain::ExtractParams() { std::unordered_set var_names; auto collect_param_fn = [&](Expr& e) { if (!e.is_constant()) { - auto vars = - ir::CollectIRNodes(e, [](const Expr* e) { return e->is_var(); }); + auto vars = ir::ir_utils::CollectIRNodes( + e, [](const Expr* e) { return e->is_var(); }); for (auto& var : vars) var_names.insert(var.As()->name); } }; diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index faa7a99c0cfde..e2e5dc531c0f7 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -805,7 +805,7 @@ void Stage::SimpleComputeAt(Stage *other, int level) { compute_ats_[other->id()] = relation; auto other_expr = other->expr(); auto find_tensors = - ir::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) { + ir::ir_utils::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) { return x->as_tensor() && x->as_tensor_ref()->name == tensor()->name; }); if (!find_tensors.empty()) { @@ -1025,7 +1025,7 @@ Iterator Stage::Fuse(const Iterator &level0, const Iterator &level1) { std::vector Stage::input_statements() const { if (!expr_.defined()) return {}; VLOG(3) << "stage " << id() << " expr: " << expr_; - auto load_exprs = ir::CollectIRNodes( + auto load_exprs = ir::ir_utils::CollectIRNodes( expr_, [](const Expr *x) { return x->As(); }); std::set statements; for (auto &expr : load_exprs) { @@ -1563,10 +1563,11 @@ void Stage::ShareBufferWith(Stage *other) { isl_map *__isl_give GatherAccesses(Stage *stage, const std::string &tensor_name) { CHECK(stage->tensor_); - auto loads = ir::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) { - return x->As() && - x->As()->tensor.as_tensor()->name == tensor_name; - }); + auto loads = + ir::ir_utils::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) { + return x->As() && + x->As()->tensor.as_tensor()->name == tensor_name; + }); auto vars = stage->tensor_->axis_with_reduce(); @@ -1888,7 +1889,7 @@ StageMap CreateStages(const std::vector &tensors) { std::set all_tensors(tensors.begin(), tensors.end()); for (auto &tensor : tensors) { - auto used_tensors = ir::CollectIRNodes( + auto used_tensors = ir::ir_utils::CollectIRNodes( tensor->body(), [](const Expr *x) { return x->as_tensor(); }); for (const Expr &x : used_tensors) { all_tensors.insert(x.as_tensor_ref()); From 27d0fed793ac229645371d4b34c1a6c3970a02c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:51:47 +0800 Subject: [PATCH 32/39] move ir_verify from namespace optim to ir_utils (#57532) --- paddle/cinn/backends/codegen_c.cc | 2 +- paddle/cinn/backends/codegen_cuda_dev.cc | 2 +- paddle/cinn/backends/llvm/codegen_llvm.cc | 2 +- paddle/cinn/ir/test/ir_verify_test.cc | 10 ++++++---- paddle/cinn/ir/utils/ir_verify.cc | 12 ++++++++---- 
paddle/cinn/ir/utils/ir_verify.h | 9 ++++++--- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc index 2345bf53d36cd..6440339947682 100644 --- a/paddle/cinn/backends/codegen_c.cc +++ b/paddle/cinn/backends/codegen_c.cc @@ -38,7 +38,7 @@ using cinn::common::float16; const char *kCKeywordRestrict = "__restrict__"; void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) { - ir::IrVerify(Expr(module)); + ir::ir_utils::IrVerify(Expr(module)); if (!outputs.c_header_name.empty()) { auto source = Compile(module, OutputKind::CHeader); diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 1f6f5bba154aa..5a1ddbc450a09 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -56,7 +56,7 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) { void CodeGenCUDA_Dev::Compile(const ir::Module &module, const Outputs &outputs) { - ir::IrVerify(Expr(module)); + ir::ir_utils::IrVerify(Expr(module)); CodeGenC::inline_builtin_codes_ = false; if (!outputs.c_header_name.empty()) { diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index 5ff8ce03c77b0..b91772bd688b8 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -790,7 +790,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Call *op) { llvm::Value *CodeGenLLVM::Visit(const ir::_Module_ *op) { { Expr body_to_verify(&Reference(op)); - ir::IrVerify(body_to_verify); + ir::ir_utils::IrVerify(body_to_verify); } for (auto &fn : op->functions) { diff --git a/paddle/cinn/ir/test/ir_verify_test.cc b/paddle/cinn/ir/test/ir_verify_test.cc index 06a842ef5ba81..183f20e491fbc 100644 --- a/paddle/cinn/ir/test/ir_verify_test.cc +++ b/paddle/cinn/ir/test/ir_verify_test.cc @@ -18,12 +18,14 @@ #include "paddle/cinn/ir/op/ir_operators.h" -namespace cinn::ir { - +namespace cinn { +namespace ir { +namespace ir_utils { TEST(IrVerify, basic) { Expr a(1); Expr b(1); IrVerify(a + b); } - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_verify.cc b/paddle/cinn/ir/utils/ir_verify.cc index d0f69802438bb..b961e25114249 100644 --- a/paddle/cinn/ir/utils/ir_verify.cc +++ b/paddle/cinn/ir/utils/ir_verify.cc @@ -17,7 +17,10 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -namespace cinn::ir { +namespace cinn { +namespace ir { +namespace ir_utils { +namespace { struct IrVerifyVisitor : public ir::IRMutator<> { using ir::IRMutator<>::Visit; @@ -30,10 +33,11 @@ struct IrVerifyVisitor : public ir::IRMutator<> { NODETY_FORALL(__) #undef __ }; - +} // namespace void IrVerify(Expr e) { IrVerifyVisitor visitor; visitor.Visit(&e, &e); } - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_verify.h b/paddle/cinn/ir/utils/ir_verify.h index deddb3178282d..d47c97e0197d4 100644 --- a/paddle/cinn/ir/utils/ir_verify.h +++ b/paddle/cinn/ir/utils/ir_verify.h @@ -15,8 +15,11 @@ #pragma once #include "paddle/cinn/ir/ir.h" -namespace cinn::ir { +namespace cinn { +namespace ir { +namespace ir_utils { void IrVerify(Expr e); - -} // namespace cinn::ir +} // namespace ir_utils +} // namespace ir +} // namespace cinn From 98be3d95e2041938fa7e783a07ec5cee56251f38 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 21 Sep 2023 14:52:09 +0800 Subject: [PATCH 33/39] =?UTF-8?q?=E3=80=90CINN=E3=80=91move=20ir=5Freplace?= =?UTF-8?q?=20from=20cinn/optim=20to=20cinn/ir/utils=20(#57524)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move ir_replace from cinn/optim to cinn/ir/utils * delete extra modification --- paddle/cinn/ir/utils/CMakeLists.txt | 3 ++- paddle/cinn/{optim => ir/utils}/ir_replace.cc | 8 +++++--- paddle/cinn/{optim => ir/utils}/ir_replace.h | 7 ++++--- paddle/cinn/optim/CMakeLists.txt | 1 - paddle/cinn/optim/buffer_assign.cc | 2 +- .../cinn/optim/eliminate_broadcast_in_forloop.cc | 4 ++-- paddle/cinn/optim/unroll_loops.cc | 4 ++-- paddle/cinn/optim/vectorize_loops.cc | 16 +++++++++------- paddle/cinn/poly/stage.cc | 2 +- 9 files changed, 26 insertions(+), 21 deletions(-) rename paddle/cinn/{optim => ir/utils}/ir_replace.cc (93%) rename paddle/cinn/{optim => ir/utils}/ir_replace.h (91%) diff --git a/paddle/cinn/ir/utils/CMakeLists.txt b/paddle/cinn/ir/utils/CMakeLists.txt index 5613bf7260155..032bf537d2fce 100644 --- a/paddle/cinn/ir/utils/CMakeLists.txt +++ b/paddle/cinn/ir/utils/CMakeLists.txt @@ -9,4 +9,5 @@ gather_srcs( ir_verify.cc ir_compare.cc ir_nodes_collector.cc - ir_copy.cc) + ir_copy.cc + ir_replace.cc) diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/ir/utils/ir_replace.cc similarity index 93% rename from paddle/cinn/optim/ir_replace.cc rename to paddle/cinn/ir/utils/ir_replace.cc index 3dc39a08a3817..da2305359c5e9 100644 --- a/paddle/cinn/optim/ir_replace.cc +++ b/paddle/cinn/ir/utils/ir_replace.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include @@ -22,7 +22,8 @@ #include "paddle/cinn/utils/string.h" namespace cinn { -namespace optim { +namespace ir { +namespace ir_utils { using utils::GetStreamCnt; namespace { @@ -65,5 +66,6 @@ void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to) { IrReplaceMutator(from, to)(expr); } -} // namespace optim +} // namespace ir_utils +} // namespace ir } // namespace cinn diff --git a/paddle/cinn/optim/ir_replace.h b/paddle/cinn/ir/utils/ir_replace.h similarity index 91% rename from paddle/cinn/optim/ir_replace.h rename to paddle/cinn/ir/utils/ir_replace.h index 7c95d1e6f6c38..312e4c61eff0a 100644 --- a/paddle/cinn/optim/ir_replace.h +++ b/paddle/cinn/ir/utils/ir_replace.h @@ -18,10 +18,11 @@ #include "paddle/cinn/ir/ir.h" namespace cinn { -namespace optim { +namespace ir { +namespace ir_utils { //! Replace the variable \p v to expression \p e in expression \p expr. 
void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to); - -} // namespace optim +} // namespace ir_utils +} // namespace ir } // namespace cinn diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 1b4a55479ef0b..03b8c95b74173 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -4,7 +4,6 @@ gather_srcs( cinnapi_src SRCS replace_call_with_expr.cc - ir_replace.cc replace_var_with_expr.cc ir_simplify.cc optimize.cc diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc index 175689defbe36..f749cac9ba502 100644 --- a/paddle/cinn/optim/buffer_assign.cc +++ b/paddle/cinn/optim/buffer_assign.cc @@ -17,8 +17,8 @@ #include "paddle/cinn/common/union_find.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/lang/lower_impl.h" -#include "paddle/cinn/optim/ir_replace.h" namespace cinn { namespace optim { diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc index bb546f694be9d..e836563a9feb0 100644 --- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc +++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc @@ -19,8 +19,8 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/ir/utils/ir_visitor.h" -#include "paddle/cinn/optim/ir_replace.h" namespace cinn { namespace optim { @@ -54,7 +54,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator { std::tie(let_expr, tmp) = CreateTmpLet(broadcast); let_exprs.push_back(let_expr); - optim::IrReplace(expr, broadcast, tmp); + cinn::ir::ir_utils::IrReplace(expr, broadcast, tmp); } // insert the let expressions to the outer forloop. 
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc index fc5fab85eca5f..32d4037b83e3e 100644 --- a/paddle/cinn/optim/unroll_loops.cc +++ b/paddle/cinn/optim/unroll_loops.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" namespace cinn { namespace optim { @@ -95,7 +95,7 @@ struct UnrollMutator : public ir::IRMutator { for (int i = min->value; i < extent->value; i++) { Expr start = op->min + i; body.push_back(optim::IRCopy(op->body)); - optim::IrReplace(&body.back(), op->loop_var, start); + cinn::ir::ir_utils::IrReplace(&body.back(), op->loop_var, start); } *expr = ir::Block::Make(body); diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 357bafe79730a..8ed13e9d5971b 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -29,7 +29,7 @@ #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_replace.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/unroll_loops.h" #include "paddle/cinn/utils/functional.h" @@ -149,11 +149,11 @@ class TensorVectorizeTeller : public ir::IRMutator { // check tensor accessed sequentially by comparing index one by one Expr first_idx = optim::IRCopy(indices.back()); - optim::IrReplace(&first_idx, Expr(iter_var_), Expr(0)); + cinn::ir::ir_utils::IrReplace(&first_idx, Expr(iter_var_), Expr(0)); const auto &interval = var_intervals_->at(iter_var_->name); for (int i = 1; i < interval.r; ++i) { Expr next_idx = optim::IRCopy(indices.back()); - optim::IrReplace(&next_idx, Expr(iter_var_), Expr(i)); + cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i)); auto gap = common::AutoSimplify(Expr(next_idx - first_idx)); if (!gap.As() || gap.as_int32() != i) { VLOG(5) << "Tensor:" << tensor->name @@ -310,7 +310,8 @@ class CudaVectorizer : public IRMutator { // generate a get_addr expr to get the address of the tensor Expr converted_tensor = Load::Make(tensor, indices); - optim::IrReplace(&converted_tensor, iter_var_, Expr(int32_t(0))); + cinn::ir::ir_utils::IrReplace( + &converted_tensor, iter_var_, Expr(int32_t(0))); auto get_addr = ir::intrinsics::GetAddr::Make(converted_tensor); // generate a let expression to cast the tensor into the local vector @@ -888,7 +889,7 @@ struct VectorizeLoops_ : public IRMutator { ForType::Serial, DeviceAPI::UNK, IRCopy(inner_for->body))}); - optim::IrReplace( + cinn::ir::ir_utils::IrReplace( &inner_for_b, inner_for->loop_var, Expr(new_iterator_inner)); Expr out_for_b = For::Make(new_iterator_outer, @@ -898,7 +899,7 @@ struct VectorizeLoops_ : public IRMutator { outer_for->device_api, inner_for_b, outer_for->vectorize_info()); - optim::IrReplace( + cinn::ir::ir_utils::IrReplace( &out_for_b, outer_for->loop_var, Expr(new_iterator_outer)); *expr = Block::Make({out_for_a, out_for_b}); VLOG(2) << *expr; @@ -960,7 +961,8 @@ struct VectorizeLoops_ : public IRMutator { } else { new_index = Expr(forloop->loop_var) * factor + Expr(new_iterator); } - optim::IrReplace(&forloop->body, forloop->loop_var, new_index); + cinn::ir::ir_utils::IrReplace( + &forloop->body, forloop->loop_var, new_index); auto new_forloop = For::Make(new_iterator, forloop->min, 
make_const(factor), diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index e2e5dc531c0f7..d74bce1404e5b 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -28,9 +28,9 @@ #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_replace.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/poly/compute_at_transform.h" From 55b7523779bbbed757c4e5b8294e12df64f79af5 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 21 Sep 2023 15:36:50 +0800 Subject: [PATCH 34/39] [clang-tidy] NO.23 bugprone-branch-clone (#57522) * clangtidyNo23 * fix * fix --- .clang-tidy | 2 +- .../collective/processgroup_comm_utils.cc | 2 +- paddle/fluid/framework/details/fetch_op_handle.cc | 2 +- paddle/fluid/framework/downpour_worker.cc | 5 ++--- paddle/fluid/framework/executor_cache.cc | 2 +- paddle/fluid/framework/io/fs.cc | 9 ++++----- .../fluid/framework/ir/constant_folding_pass.cc | 4 +--- .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 5 ++--- .../garbage_collector/event_garbage_collector.cc | 7 ++++--- .../garbage_collector/fast_garbage_collector.cc | 7 ++++--- .../new_executor/interpreter/static_build.cc | 6 ++---- .../framework/new_executor/new_ir_interpreter.cc | 7 ++++--- .../framework/new_executor/program_interpreter.cc | 7 ++++--- paddle/fluid/framework/operator.cc | 9 ++------- paddle/fluid/framework/parallel_executor.cc | 10 ++++------ paddle/fluid/framework/tensor_util.cc | 6 ++++-- paddle/fluid/framework/var_desc.cc | 7 ++----- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/memory/memcpy.cc | 2 +- paddle/fluid/operators/batch_norm_op.cc | 4 ---- paddle/fluid/operators/data_norm_op.cc | 2 -- .../operators/detection/multiclass_nms_op.cc | 15 ++------------- .../operators/fused/fused_bn_activation_op.cc | 2 -- .../operators/fused/fused_bn_add_activation_op.cc | 2 -- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 2 +- .../operators/fused/mkldnn/multi_gru_mkldnn_op.cc | 4 ++-- paddle/fluid/operators/inplace_abn_op.cc | 2 -- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/reshape_mkldnn_op.cc | 2 -- paddle/fluid/operators/reader/buffered_reader.cc | 6 ++---- paddle/fluid/operators/sum_op.cc | 2 +- .../pir/phi_kernel_adaptor/phi_kernel_util.cc | 5 ++--- paddle/fluid/platform/place.cc | 6 +----- .../fluid/prim/api/manual_prim/static_prim_api.cc | 2 -- paddle/fluid/pybind/eager_method.cc | 6 ++---- paddle/fluid/pybind/eager_properties.cc | 6 ++---- paddle/fluid/pybind/eager_utils.cc | 9 +++------ paddle/fluid/pybind/inference_api.cc | 2 +- paddle/fluid/pybind/op_function_common.cc | 4 +--- paddle/phi/core/compat/convert_utils.cc | 2 +- paddle/phi/core/kernel_factory.cc | 5 ++--- paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/cpu/diagonal_grad_kernel.cc | 6 ++---- .../phi/kernels/cpu/generate_proposals_kernel.cc | 8 +------- .../phi/kernels/cpu/send_ue_recv_grad_kernel.cc | 4 ++-- paddle/phi/kernels/funcs/vol2col.cc | 4 ++-- 47 files changed, 79 insertions(+), 142 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 6a6700c192027..924095b4def28 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -4,7 +4,7 @@ bugprone-argument-comment, 
-bugprone-assert-side-effect, -bugprone-bad-signal-to-kill-thread, -bugprone-bool-pointer-implicit-conversion, --bugprone-branch-clone, +bugprone-branch-clone, bugprone-copy-constructor-init, -bugprone-dangling-handle, -bugprone-dynamic-static-initializers, diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc index 94723906fccb1..eec697f523945 100644 --- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc +++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc @@ -51,7 +51,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { #else return nullptr; #endif - } else if (place.GetType() == phi::AllocationType::CUSTOM) { + } else if (place.GetType() == phi::AllocationType::CUSTOM) { // NOLINT #if defined(PADDLE_WITH_CUSTOM_DEVICE) return static_cast(pg)->XCCLComm( place); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 2a504b2a0fc2b..b71c476a2c95e 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -120,7 +120,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { - if (platform::is_gpu_place(src_item.place())) { + if (platform::is_gpu_place(src_item.place())) { // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 8a0406864cde7..e69a25bb32781 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -362,9 +362,8 @@ void DownpourWorker::CopySparseTable() { if (src_table == dest_table) { continue; } else if (!copy_table_config_.sparse_copy_by_feasign()) { - if (feasign_set_.find(src_table) == feasign_set_.end()) { - continue; - } else if (feasign_set_[src_table].empty()) { + if (feasign_set_.find(src_table) == feasign_set_.end() || + feasign_set_[src_table].empty()) { continue; } feanum = fleet_ptr_->CopyTable(src_table, dest_table); diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 64d5ce24d20fe..5613a8dbf155e 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -47,7 +47,7 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { execution_strategy.num_threads_ = 2; break; } - case platform::DeviceType::CUDA: { + case platform::DeviceType::CUDA: { // NOLINT // NOTE: According experiments, one thread is faster in // most model training. 
execution_strategy.num_threads_ = 1; diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index a39147a97cf7e..4a689409d412b 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -399,13 +399,12 @@ void hdfs_mv(const std::string& src, const std::string& dest) { } int fs_select_internal(const std::string& path) { - if (fs_begin_with_internal(path, "hdfs:")) { - return 1; - } else if (fs_begin_with_internal(path, "afs:")) { + if (fs_begin_with_internal(path, "hdfs:") || + fs_begin_with_internal(path, "afs:")) { return 1; + } else { + return 0; } - - return 0; } std::shared_ptr fs_open_read(const std::string& path, diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index 3b3f23933fb6d..f8e0ac9475b5d 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -81,9 +81,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { std::unordered_map map; for (auto in_node : op_node->inputs) { map[in_node->Name()] = 0; - if (!in_node->Var()->Persistable()) { - input_persis = false; - } else if (!in_node->inputs.empty()) { + if (!in_node->Var()->Persistable() || !in_node->inputs.empty()) { input_persis = false; } } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 8f19225dc53b4..655183dc712c0 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -400,9 +400,8 @@ void QuantDequantMkldnnPass::RemoveFakeOps( if (fake_quantize_types.count(op_node->Name())) { CollectFakeQuantizeOps(graph, op_node, &nodes2rm); - } else if (fake_dequantize_types.count(op_node->Name())) { - CollectFakeDequantizeOps(graph, op_node, &nodes2rm); - } else if (fake_quantize_dequantize_types.count(op_node->Name())) { + } else if (fake_dequantize_types.count(op_node->Name()) || + fake_quantize_dequantize_types.count(op_node->Name())) { CollectFakeDequantizeOps(graph, op_node, &nodes2rm); } else if (onnx_format_quantize_dequantize_types.count(op_node->Name())) { CollectQuantizeDequantizeOpsFromONNXFormat(graph, op_node, &nodes2rm); diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index e826c94712568..e63164c020c36 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -88,9 +88,10 @@ void InterpreterCoreEventGarbageCollector::Add( if (var->IsType()) { Add(var->GetMutable()->MoveMemoryHolder(), event, ctx); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? 
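The .clang-tidy change above enables the bugprone-branch-clone check, and the hunks in this patch apply its two standard remedies: merge if / else-if branches whose bodies are identical into a single condition, or keep the duplication and silence the warning with // NOLINT where the bodies only coincide because of preprocessor guards. A minimal sketch of the merge pattern, mirroring the fs_select_internal hunk above but using plain standard C++ rather than Paddle code:

    #include <cstdio>
    #include <string>

    // Returns 1 for both remote filesystem schemes, 0 otherwise.
    int select_fs(const std::string& path) {
      // Before (flagged by bugprone-branch-clone): an if and an else-if
      // whose bodies were both "return 1;".
      // After: the duplicate branches are merged into one condition.
      if (path.rfind("hdfs:", 0) == 0 || path.rfind("afs:", 0) == 0) {
        return 1;
      }
      return 0;
    }

    int main() {
      std::printf("%d %d\n", select_fs("hdfs://a/b"), select_fs("/local/c"));
      return 0;
    }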
} else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc index 4bc8b298012ab..e7efc1f10c324 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc @@ -34,9 +34,10 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { if (var->IsType()) { Add(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? } else if (var->IsType()) { diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc index 0f9bd3f387a92..67b75bb523711 100644 --- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -267,10 +267,8 @@ phi::TensorBase* GetTensorFormVar(framework::Variable* var) { return var->template GetMutable(); } else if (var->template IsType()) { return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var + } else if (var->template IsType() || + !var->IsInitialized()) { return var->template GetMutable(); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 47823eb82b428..2dc6181180c9d 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -758,9 +758,10 @@ void NewIRInterpreter::RecordStreamForGC(InstructionBase* instr) { if (var->IsType()) { TensorRecordStream(*(var->GetMutable())); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // do nothing } else if (var->IsType()) { TensorRecordStream( diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 1384a9fb487de..2e466962c4d31 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -1292,9 +1292,10 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { if (var->IsType()) { TensorRecordStream(*(var->GetMutable())); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + } else if ( + var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // NOLINT // do nothing } else if (var->IsType()) { TensorRecordStream( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9b9979bc70f4c..7a3271a48debc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2777,8 +2777,6 @@ void OperatorWithKernel::ParseInputDataType( const phi::DenseTensor* t = nullptr; if 
(var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); } else if (var->IsType()) { @@ -3221,11 +3219,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (var->template IsType()) { tensor_out = var->template GetMutable(); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var + } else if (var->template IsType() || + !var->IsInitialized()) { tensor_out = var->template GetMutable(); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8b6363d93d134..e6c11df275b56 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -693,7 +693,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { - if (member_->build_strategy_.num_trainers_ > 1) { + if (member_->build_strategy_.num_trainers_ > 1) { // NOLINT // 1. num_tariners would be grater than 1 for nccl distributed training. return true; } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { @@ -936,11 +936,9 @@ void ParallelExecutor::BCastParamsToDevices( auto share_memory = [&] { t->ShareDataWith(main_tensor); }; // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->build_strategy_.async_mode_) { - share_memory(); - } else if (member_->use_all_reduce_ || - member_->IsUseCUDA(member_->use_device_) || - var == "@LR_DECAY_COUNTER@") { + if (member_->use_all_reduce_ || + member_->IsUseCUDA(member_->use_device_) || + var == "@LR_DECAY_COUNTER@") { copy_memory(); } else { share_memory(); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 6fe75d1a90dab..90612e5692595 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -78,7 +78,8 @@ void TensorCopyImpl(const TENSOR& src, auto size = src.numel() * phi::SizeOf(src.dtype()); #endif - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -327,7 +328,8 @@ void TensorCopySync(const phi::DenseTensor& src, return; } auto size = src.numel() * phi::SizeOf(src.dtype()); - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index b0130e055c075..836ba0fb762b3 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -386,11 +386,8 @@ struct SetVarAttrDescVisitor { template void operator()(T &&v) { using U = std::decay_t; - if (std::is_same::value) { - set_attr_value(v); - } else if (std::is_same::value) { - set_attr_value(v); - } else if (std::is_same>::value) { + if (std::is_same::value || std::is_same::value || + 
std::is_same>::value) { set_attr_value(v); } else { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6b57f1fabf4bd..70da22a3240e9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2006,7 +2006,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = true; res->SetName(name); - if (platform::is_cpu_place(place_)) { + if (platform::is_cpu_place(place_)) { // NOLINT res->SetPlace(PaddlePlace::kCPU); } else if (platform::is_ipu_place(place_)) { // Currently, IPUPlace's tensor copy between cpu and ipu has been set in @@ -2057,7 +2057,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = false; res->SetName(name); - if (platform::is_cpu_place(place_)) { + if (platform::is_cpu_place(place_)) { // NOLINT res->SetPlace(PaddlePlace::kCPU); } else if (platform::is_ipu_place(place_)) { // Currently, IPUPlace's tensor copy between cpu and ipu has been set in diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 656d6273afb3f..cf253d6c4ebdc 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -743,7 +743,7 @@ void Copy(phi::Place dst_place, VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { + dst_place.GetType() == phi::AllocationType::CPU) { // NOLINT std::memcpy(dst, src, num); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 4f1c7ab3857d7..1d45cee715409 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -386,8 +386,6 @@ phi::KernelKey BatchNormGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( @@ -530,8 +528,6 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 493351654d5eb..2e70168876162 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -495,8 +495,6 @@ class DataNormGradOp : public framework::OperatorWithKernel { const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 432713c60d969..8519752bc1049 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -101,11 +101,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { } // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
- if (score_size == 3) { - ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); - } else { - ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); - } + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); if (!ctx->IsRuntime()) { ctx->SetLoDLevel("Out", std::max(ctx->GetLoDLevel("BBoxes"), 1)); } @@ -584,14 +580,7 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - - auto score_dims = ctx->GetInputDim("Scores"); - auto score_size = score_dims.size(); - if (score_size == 3) { - ctx->SetOutputDim("Index", {-1, 1}); - } else { - ctx->SetOutputDim("Index", {-1, 1}); - } + ctx->SetOutputDim("Index", {-1, 1}); if (!ctx->IsRuntime()) { ctx->SetLoDLevel("Index", std::max(ctx->GetLoDLevel("BBoxes"), 1)); } diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 88b11f1ef39c5..ca59a466a5c2b 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -303,8 +303,6 @@ phi::KernelKey FusedBatchNormActGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index a33a91b082e5c..ed416d4ad13d1 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -267,8 +267,6 @@ phi::KernelKey FusedBatchNormAddActGradOp::GetExpectedKernelType( const phi::DenseTensor *t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 05d1e64f92ae7..5ec5e8081bb6f 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -248,7 +248,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = ctx.Attr("force_fp32_output"); // BF16 does not support force output - if (!is_bf16 && force_fp32_output) { + if (!is_bf16 && force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index d973c5e89a626..4972db5804322 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -329,7 +329,7 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = ctx.Attr("force_fp32_output"); // BF16 does not support force output - if (!is_bf16 && force_fp32_output) { + if (!is_bf16 && force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 90ecbe4506d98..1c8e0a1b56a97 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -688,7 +688,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel { const bool force_fp32_output = 
ctx.HasAttr("force_fp32_output") && ctx.Attr("force_fp32_output"); - if (force_fp32_output) { + if (force_fp32_output) { // NOLINT RunKernel(ctx); } else { RunKernel(ctx); @@ -706,7 +706,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel { auto gru_out_L2R = handler.executeSingleGru(input_mem, layer, L2R); handler.reorderInputL2RtoR2L(input_mem, layer); auto gru_out_R2L = handler.executeSingleGru(input_mem, layer, R2L); - if (layer < layers - 1) + if (layer < layers - 1) // NOLINT handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); else handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index eee0f1f304bc3..a53a9867b9903 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -284,8 +284,6 @@ class InplaceABNGradOp : public framework::OperatorWithKernel { const phi::DenseTensor* t = nullptr; if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); } if (t == nullptr) { PADDLE_THROW( diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a7f6bc512ffce..692b7f0721ceb 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -359,7 +359,7 @@ class FCMKLDNNKernel : public framework::OpKernel { bool fuse_relu = ctx.Attr("activation_type") == "relu"; IF_CHANGE_FC_TW_TYPENAME((std::is_same::value), ([&] { - if (force_fp32_output) { + if (force_fp32_output) { // NOLINT this->RunKernel(ctx); } else if (phi::funcs::is_int8()) { if (fuse_relu) { diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index b7a33edb82a00..3c53b05152b7e 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -105,8 +105,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { InferShapeSqueezeOp(ctx, x_dims, out_dims); break; case ReshapeKernelOpName::flatten: - InferShapeFlattenOp(ctx, x_dims, out_dims); - break; case ReshapeKernelOpName::flatten2: InferShapeFlattenOp(ctx, x_dims, out_dims); break; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2e24caa91c6bb..b73ffe4319be7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -213,10 +213,8 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_ptr = cpu[i].data(); auto gpu_ptr = gpu_ptrs[i]; auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); - if (platform::is_cuda_pinned_place(cpu_place)) { - memory::Copy( - place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get()); - } else if ((platform::is_gpu_place(cpu_place))) { + if (platform::is_cuda_pinned_place(cpu_place) || + platform::is_gpu_place(cpu_place)) { memory::Copy( place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get()); } else { diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 5cf9fba9f2681..ebb4cd7cf132d 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -76,7 +76,7 @@ class SumOp : public framework::OperatorWithKernel { // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL if (!((data_type == framework::proto::VarType::FP32 || data_type == framework::proto::VarType::BF16) && - ctx.OutputVar("Out")->IsType())) { + 
ctx.OutputVar("Out")->IsType())) { // NOLINT this->SetDnnFallback(true); } else if (!std::all_of(x_vars.begin(), x_vars.end(), diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc index a3997ee97db6a..437523e41bf3e 100644 --- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc @@ -196,9 +196,8 @@ void BuildValue(pir::Value value, variable_list); } // Only support DenseTensor or Vector - if (!value.type()) { - var->GetMutable(); - } else if (value.type().isa()) { + if (!value.type() || + value.type().isa()) { var->GetMutable(); } else if (value.type().isa()) { var->GetMutable(); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index b8452a594e358..d38d0418e4639 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -62,11 +62,7 @@ bool is_same_place(const Place &p1, const Place &p2) { if (places_are_same_class(p1, p2)) { if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) { return true; - } else if (is_xpu_place(p1)) { - return p1 == p2; - } else if (is_ipu_place(p1)) { - return p1 == p2; - } else if (is_custom_place(p1)) { + } else if (is_xpu_place(p1) || is_ipu_place(p1) || is_custom_place(p1)) { return p1 == p2; } else { return p1 == p2; diff --git a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc index c907be2d10256..c45a473b4a8d3 100644 --- a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc +++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc @@ -50,8 +50,6 @@ Tensor full(const IntArray& shape, op->SetAttr("shape", shape.GetData()); switch (dtype) { case phi::DataType::FLOAT16: - op->SetAttr("str_value", std::to_string(value.to())); - break; case phi::DataType::BFLOAT16: op->SetAttr("str_value", std::to_string(value.to())); break; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 59ef86423788a..e72f5dc77f99c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1617,7 +1617,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, py::isinstance(value_obj_tmp) || py::isinstance(value_obj_tmp) || PyComplex_Check(value_obj)) { - if (self->tensor.dtype() == phi::DataType::FLOAT32) { + if (self->tensor.dtype() == phi::DataType::FLOAT32 || + self->tensor.dtype() == phi::DataType::FLOAT16) { attrs["values"] = std::vector{ value_obj_tmp.cast()}; } else if (self->tensor.dtype() == phi::DataType::FLOAT64) { @@ -1632,9 +1633,6 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, } else if (self->tensor.dtype() == phi::DataType::BOOL) { attrs["values"] = std::vector{ value_obj_tmp.cast()}; - } else if (self->tensor.dtype() == phi::DataType::FLOAT16) { - attrs["values"] = std::vector{ - value_obj_tmp.cast()}; } else if (self->tensor.dtype() == phi::DataType::COMPLEX64) { attrs["values"] = std::vector{ value_obj_tmp.cast>()}; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 59ecee2c5d668..517c210830022 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -92,13 +92,11 @@ Tensor's type. 
PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { EAGER_TRY - if (!self->tensor.defined()) { + if (!self->tensor.defined() || self->tensor.is_dense_tensor()) { // be same to old dygraph return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } - if (self->tensor.is_dense_tensor()) { - return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); - } else if (self->tensor.is_selected_rows()) { + if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); } else if (egr::IsVariableCompatTensor(self->tensor)) { return ToPyObject(static_cast( diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 0432ca88d6ada..87660d9fd88ca 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -173,13 +173,11 @@ bool PyObject_CheckIRVectorOfOpResult(PyObject* obj) { } } bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { - if (obj == Py_None) { + if (obj == Py_None || obj == Py_False) { return false; // To be compatible with QA integration testing. Some // test cases pass in None. } else if (obj == Py_True) { return true; - } else if (obj == Py_False) { - return false; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -1125,9 +1123,8 @@ static paddle::Tensor& GetTensorFromPyObject(const std::string& op_type, return emptytensor; } - if (PyObject_TypeCheck(obj, p_tensor_type)) { - return reinterpret_cast(obj)->tensor; - } else if (PyObject_TypeCheck(obj, p_string_tensor_type)) { + if (PyObject_TypeCheck(obj, p_tensor_type) || + PyObject_TypeCheck(obj, p_string_tensor_type)) { return reinterpret_cast(obj)->tensor; } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b1fbf43aac8b6..bd569f328b115 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -239,7 +239,7 @@ void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT paddle_infer::PlaceType ToPaddleInferPlace( phi::AllocationType allocation_type) { - if (allocation_type == phi::AllocationType::CPU) { + if (allocation_type == phi::AllocationType::CPU) { // NOLINT return paddle_infer::PlaceType::kCPU; } else if (allocation_type == phi::AllocationType::GPU) { return paddle_infer::PlaceType::kGPU; diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 366465e6b2984..9d8074628fb13 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -121,13 +121,11 @@ bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } bool CastPyArg2Boolean(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { - if (obj == Py_None) { + if (obj == Py_None || obj == Py_False) { return false; // To be compatible with QA integration testing. Some // test case pass in None. } else if (obj == Py_True) { return true; - } else if (obj == Py_False) { - return false; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index d82b37328850f..d4c5de0dbe6dc 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -67,7 +67,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? 
phi::backends::gpu::GetCurrentDeviceId() : 0); #endif #ifdef PADDLE_WITH_DNNL - case phi::Backend::ONEDNN: + case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index d58decadfadca..f9c1dca46b2fb 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -63,9 +63,8 @@ KernelFactory& KernelFactory::Instance() { bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const { if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) { - if (phi::OpUtilsMap::Instance().Contains(op_type)) { - return true; - } else if (kernels_.find(op_type) != kernels_.end()) { + if (phi::OpUtilsMap::Instance().Contains(op_type) || + (kernels_.find(op_type) != kernels_.end())) { return true; } } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index aa1b6526cd5f8..e0df80157013e 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1130,7 +1130,7 @@ void ExpandInferMeta(const MetaTensor& x, std::max(static_cast(x_dims.size()), expand_shape.size()); std::vector out_shape(out_rank); for (int i = 0; i < static_cast(expand_shape.size()); ++i) { - if (x_dims[i] == -1) { + if (x_dims[i] == -1) { // NOLINT out_shape[i] = -1; } else if (expand_shape[i] == -1) { if (static_cast(x_dims.size()) > i) { diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 5ccb5ad8c43b4..d8383b45beb79 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -63,10 +63,8 @@ void DiagonalGradKernel(const Context& dev_ctx, idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_)); bool flag = false; - if (offset_ == 0 && axis1_dim == axis2_dim) { - idx_dim.push_back(axis1_dim); - flag = true; - } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { + if ((offset_ == 0 && axis1_dim == axis2_dim) || + (offset_ > 0 && (axis1_dim + offset_) == axis2_dim)) { idx_dim.push_back(axis1_dim); flag = true; } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { diff --git a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc index 2e468ef2d07ff..e9764035613ed 100644 --- a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc @@ -52,13 +52,7 @@ void ClipTiledBoxes(const phi::CPUContext& ctx, T im_h = is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; for (int64_t i = 0; i < input_boxes.numel(); ++i) { - if (i % 4 == 0) { - out_data[i] = - std::max(std::min(input_boxes_data[i], im_w - offset), zero); - } else if (i % 4 == 1) { - out_data[i] = - std::max(std::min(input_boxes_data[i], im_h - offset), zero); - } else if (i % 4 == 2) { + if ((i % 4 == 0) || (i % 4 == 2)) { out_data[i] = std::max(std::min(input_boxes_data[i], im_w - offset), zero); } else { diff --git a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc index 0ca3be62a3971..fac19f142dffc 100644 --- a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc @@ -256,7 +256,7 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; - if (message_op == "ADD") { + if (message_op == "ADD") { // NOLINT #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif @@ -283,7 +283,7 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - if (message_op == "ADD") { + if (message_op == "ADD") { // NOLINT #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index 0f411b8894ce9..e505fcb3de337 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -66,7 +66,7 @@ class Vol2ColFunctor { // changed bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_forth = paddings[0]; int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; @@ -191,7 +191,7 @@ class Col2VolFunctor { input_channels * filter_depth * filter_height * filter_width; bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_forth = paddings[0]; int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; From 4c856f9d714999ade2cea66728c8e498067c5c1d Mon Sep 17 00:00:00 2001 From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com> Date: Thu, 21 Sep 2023 16:33:11 +0800 Subject: [PATCH 35/39] [PRIM][PIR]Migrate prim rules (#57554) * fix bugs of generating Op::Build when Op has optional tensor * add default constructor for IrMetaTensor * fix bugs * polish guard * pir support prim gelu and rsqrt * support prim bwd ops * migrate vjp rules of cast,add,multiply,elementwise_pow * add cast as primitive op * fix bugs in elementwise_pow_grad * add test for cast_grad * add test for elementwise_add_grad * add test for elementwise_mul_grad * add test for elementwise_pow_grad * fix bugs * fix bugs * support pir prim backward ops * refien * fix bug * migrate layer_norm custom vjp rules to pir * fix bugs in ir_backward * fix backward , scope, and concat_grad prim * add layer_norm fwd decompose logic * fix pow * change _use_new_ir_api to in_pir_mode * add _static_guard * fix * fix executor cuda700 error caused by full and full_like * refine * add vjp rules * fix bugs * add scope * add test * add add op prim rules --------- Co-authored-by: YuanRisheng Co-authored-by: cyber-pioneer Co-authored-by: Charles-hit Co-authored-by: zhangbo9674 --- paddle/fluid/primitive/codegen/gen.py | 18 +- .../rule/vjp/generated/generated_vjp.cc.j2 | 2 +- paddle/fluid/primitive/primitive.yaml | 1 + paddle/fluid/primitive/rule/vjp/details.h | 389 ++++++++++++++++-- paddle/fluid/pybind/ir.cc | 4 + paddle/phi/api/yaml/legacy_backward.yaml | 2 +- python/paddle/autograd/ir_backward.py | 5 +- python/paddle/decomposition/rules.py | 80 ++++ python/paddle/tensor/creation.py | 9 +- test/legacy_test/prim_op_test.py | 9 +- test/legacy_test/test_activation_op.py | 79 +++- test/legacy_test/test_cast_op.py | 10 +- test/legacy_test/test_concat_op.py | 162 +++++++- test/legacy_test/test_elementwise_add_op.py | 10 + test/legacy_test/test_elementwise_mul_op.py | 19 +- test/legacy_test/test_elementwise_pow_op.py | 19 +- 
test/legacy_test/test_layer_norm_op.py | 56 ++- test/legacy_test/test_reshape_op.py | 19 +- test/legacy_test/test_split_op.py | 25 +- test/legacy_test/test_sum_op.py | 31 +- test/legacy_test/test_transpose_op.py | 40 +- 21 files changed, 882 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index f9a920730967d..e0eeeb10a3a4d 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -72,8 +72,20 @@ ] -PRIM_VJP = ['divide_grad', 'sum_grad'] # vjp list of primitive op -CUSTOM_VJP = ['gelu_grad'] # custom vjp list of composite op +PRIM_VJP = [ + 'divide_grad', + 'sum_grad', + 'cast_grad', + 'add_grad', + 'multiply_grad', + 'elementwise_pow_grad', + 'reshape_grad', + 'split_grad', + 'tanh_grad', + 'transpose_grad', + 'concat_grad', +] # vjp list of primitive op +CUSTOM_VJP = ['gelu_grad', 'layer_norm_grad'] # custom vjp list of composite op VJP_COMPS = PRIM_VJP + CUSTOM_VJP BACKENDS = [ @@ -149,6 +161,8 @@ 'embedding_grad', 'sqrt', 'uniform', + 'split', + 'transpose', ] diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 1ab275ceaecbf..6737a73d69eb5 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -106,7 +106,7 @@ paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{ {% else %} std::vector {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr); for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) { - {{api.outputs[i].name}} = !stop_gradients[{{i}}][i] ? &vjp_res[{{i}}][i] : nullptr; + {{api.outputs[i].name}}[i] = !stop_gradients[{{i}}][i] ? &vjp_res[{{i}}][i] : nullptr; } {% endif %} {% endfor %} diff --git a/paddle/fluid/primitive/primitive.yaml b/paddle/fluid/primitive/primitive.yaml index a42e2503e31ba..ccf9673bafba0 100644 --- a/paddle/fluid/primitive/primitive.yaml +++ b/paddle/fluid/primitive/primitive.yaml @@ -49,3 +49,4 @@ - erf - tanh - full +- cast diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index eb640a4643ed3..96b4d051b7cde 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -134,32 +134,371 @@ void gelu_grad(const Tensor& x, // Promote to fp32 when the input type is fp16 for keeping consistent with // phi kernel - // Scale only support fp32 attr in static graph mode, use elementwise_xx - // when precision is over fp32. 
- if (approximate) { - auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; - auto kKappa = 0.044715; - auto x_sq = x * x; - auto x_cube = x_sq * x; - auto inner = kBeta * (x + kKappa * x_cube); - auto tanh_inner = tanh(inner); - - auto left = scale(x, 0.5); - auto right = scale(tanh_inner, 1., 1.); - - auto left_derivative = scale(right, 0.5); - - auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); - auto inner_derivative = kBeta * (scale(3 * kKappa * x_sq, 1., 1.)); - auto right_derivative = left * tanh_derivative * inner_derivative; - - set_output(out_grad * (left_derivative + right_derivative), x_grad); + if (x.dtype() == phi::DataType::FLOAT16 || + x.dtype() == phi::DataType::BFLOAT16) { + auto promoted_x = cast(x, phi::DataType::FLOAT32); + auto promoted_out_grad = cast(out_grad, phi::DataType::FLOAT32); + if (approximate) { + float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5; + float kkappa = 0.044715; + auto x_sq = promoted_x * promoted_x; + auto x_cube = x_sq * promoted_x; + auto inner = kbeta * (promoted_x + kkappa * x_cube); + auto tanh_inner = tanh(inner); + + auto left = scale(promoted_x, 0.5); + auto right = scale(tanh_inner, 1., 1.); + + auto left_derivative = scale(right, 0.5); + + auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); + auto inner_derivative = kbeta * (scale(3 * kkappa * x_sq, 1., 1.)); + auto right_derivative = left * tanh_derivative * inner_derivative; + + set_output( + cast(promoted_out_grad * (left_derivative + right_derivative), + x.type()), + x_grad); + } else { + float kalpha = M_SQRT1_2; + float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto cdf = scale(scale(erf(kalpha * promoted_x), 1., 1.), 0.5); + auto pdf = kbeta * exp(scale(promoted_x * promoted_x, -0.5)); + set_output( + cast(promoted_out_grad * (cdf + promoted_x * pdf), x.type()), + x_grad); + } } else { - auto kAlpha = M_SQRT1_2; - auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; - auto cdf = scale(scale(erf(kAlpha * x), 1., 1.), 0.5); - auto pdf = kBeta * exp(scale(x * x, -0.5)); - set_output(out_grad * (cdf + x * pdf), x_grad); + // Scale only support fp32 attr in static graph mode, use elementwise_xx + // when precision is over fp32. 
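The gelu_grad rule being reworked here encodes the analytic GELU derivative. In the exact (non-approximate) branch, gelu(x) = x * Phi(x), so d gelu / dx = Phi(x) + x * phi(x), with Phi and phi the standard normal CDF and PDF; the constants kAlpha = 1/sqrt(2) and kBeta = 1/sqrt(2*pi) in the code are exactly those. A small standalone check of that closed form against a central finite difference, written in plain C++ and independent of Paddle:

    #include <cmath>
    #include <cstdio>

    double gelu(double x) {
      return x * 0.5 * (1.0 + std::erf(x / std::sqrt(2.0)));
    }

    double gelu_grad(double x) {
      const double kPi = 3.14159265358979323846;
      double cdf = 0.5 * (1.0 + std::erf(x / std::sqrt(2.0)));      // Phi(x)
      double pdf = std::exp(-0.5 * x * x) / std::sqrt(2.0 * kPi);   // phi(x)
      return cdf + x * pdf;
    }

    int main() {
      const double xs[] = {-2.0, -0.5, 0.0, 1.0, 3.0};
      const double h = 1e-6;
      for (double x : xs) {
        double numeric = (gelu(x + h) - gelu(x - h)) / (2.0 * h);
        std::printf("x=%5.1f analytic=%.6f numeric=%.6f\n",
                    x, gelu_grad(x), numeric);
      }
      return 0;
    }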
+ if (approximate) { + auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + auto kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = tanh(inner); + + auto left = scale(x, 0.5); + auto right = scale(tanh_inner, 1., 1.); + + auto left_derivative = scale(right, 0.5); + + auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); + auto inner_derivative = kBeta * (scale(3 * kKappa * x_sq, 1., 1.)); + auto right_derivative = left * tanh_derivative * inner_derivative; + + set_output(out_grad * (left_derivative + right_derivative), x_grad); + } else { + auto kAlpha = M_SQRT1_2; + auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto cdf = scale(scale(erf(kAlpha * x), 1., 1.), 0.5); + auto pdf = kBeta * exp(scale(x * x, -0.5)); + set_output(out_grad * (cdf + x * pdf), x_grad); + } + } +} + +template +void reshape_grad(const Tensor& x, const Tensor& grad_out, Tensor* grad_x) { + if (grad_x) { + auto grad_x_tmp = reshape(grad_out, phi::vectorize(x.dims())); + set_output(grad_x_tmp, grad_x); + } +} + +template +void transpose_grad(const Tensor& grad_out, + const std::vector& perm, + Tensor* grad_x) { + if (grad_x) { + std::vector reverse_perm(perm); + // make origin ranks + for (int i = 0; i < static_cast(perm.size()); ++i) { + if (perm[i] >= 0) { + reverse_perm[perm[i]] = i; + } else { + reverse_perm[perm[i] + perm.size()] = i; + } + } + auto grad_x_tmp = transpose(grad_out, reverse_perm); + set_output(grad_x_tmp, grad_x); + } +} + +template +void tanh_grad(const Tensor& out, const Tensor& grad_out, Tensor* grad_x) { + if (!grad_x) return; + auto grad_x_tmp = grad_out * (1 - out * out); + set_output(grad_x_tmp, grad_x); +} + +template +void concat_grad(const std::vector& x, + const Tensor& out_grad, + const Scalar& axis, + std::vector x_grad) { + int axis_value = axis.to(); + int rank = x[0].dims().size(); + if (axis_value < 0) { + axis_value = axis_value + rank; + } + axis_value = axis_value > 0 ? 
axis_value : 0; + std::vector sections; + int x_num = x.size(); + for (int i = 0; i < x_num; ++i) { + sections.push_back(x[i].dims()[axis_value]); + } + std::vector x_grad_tmp = + split(out_grad, IntArray(sections), axis_value); + for (int i = 0; i < x_num; ++i) { + if (x_grad[i]) { + set_output(x_grad_tmp.at(i), x_grad.at(i)); + } + } +} + +template +void split_grad(const std::vector& out_grad, + const Scalar& axis, + Tensor* x_grad) { + if (x_grad) { + auto grad = concat(out_grad, axis); + set_output(grad, x_grad); + } +} + +template +void cast_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { + if (x_grad) { + auto res = cast(out_grad, x.dtype()); + set_output(res, x_grad); + } +} + +template +void add_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + int axis, + Tensor* dx, + Tensor* dy) { + if (dy) { + if (x.dims() != y.dims()) { + // Maybe need reduce here + phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); + if (!reduce_dim.size()) { + set_output(out_grad, dy); + } else { + auto dy_reduce_res = + out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + set_output(dy_tmp, dy); + } + + } else { + set_output(out_grad, dy); + } + } + if (dx) { + if (y.dims() != x.dims()) { + // Maybe need reduce here + auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); + if (!reduce_dim.size()) { + set_output(out_grad, dx); + } else { + auto dx_reduce_res = + out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + set_output(dx_tmp, dx); + } + } else { + set_output(out_grad, dx); + } + } +} + +template +void multiply_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + int axis, + Tensor* x_grad, + Tensor* y_grad) { + if (x_grad) { + auto x_grad_unreduce = out_grad * y; + if (x_grad_unreduce.dims() != x.dims()) { + auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims()); + if (!axes.size()) { + set_output(x_grad_unreduce, x_grad); + } else { + auto x_grad_reduced = x_grad_unreduce.sum( + phi::vectorize(axes), x_grad_unreduce.dtype(), false); + if (x_grad_reduced.dims().size() != x.dims().size()) { + x_grad_reduced = reshape(x_grad_reduced, x.shape()); + } + set_output(x_grad_reduced, x_grad); + } + } else { + set_output(x_grad_unreduce, x_grad); + } + } + if (y_grad) { + auto y_grad_unreduce = out_grad * x; + if (y_grad_unreduce.dims() != y.dims()) { + auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims()); + if (!axes.size()) { + set_output(y_grad_unreduce, y_grad); + } else { + auto y_grad_reduced = y_grad_unreduce.sum( + phi::vectorize(axes), y_grad_unreduce.dtype(), false); + if (y_grad_reduced.dims().size() != y.dims().size()) { + y_grad_reduced = reshape(y_grad_reduced, y.shape()); + } + set_output(y_grad_reduced, y_grad); + } + } else { + set_output(y_grad_unreduce, y_grad); + } + } +} + +template +void elementwise_pow_grad(const Tensor& x, + const Tensor& y, + const Tensor& out_grad, + Tensor* dx, + Tensor* dy) { + if (dy) { + // dy = lnx * x^y + auto lnx = log(x); + auto x_pow_y = elementwise_pow(x, y); + auto dy_res = lnx * x_pow_y * out_grad; + if (x.dims() != y.dims()) { + // Maybe need reduce here + phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims()); + if (!reduce_dim.size()) { + set_output(dy_res, dy); + } else { + auto dy_reduce_res = + dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, 
phi::vectorize(y.dims())); + set_output(dy_tmp, dy); + } + } else { + set_output(dy_res, dy); + } + } // indicate we will compute dy + if (dx) { + // dx = y * x^(y-1) + auto tmp_z = y - 1.0; + auto x_pow_z = elementwise_pow(x, tmp_z); + auto dx_res = y * x_pow_z * out_grad; + if (y.dims() != x.dims()) { + // Maybe need reduce here + auto reduce_dim = get_reduce_dims(x.dims(), y.dims()); + if (!reduce_dim.size()) { + set_output(dx_res, dx); + } else { + auto dx_reduce_res = + dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + set_output(dx_tmp, dx); + } + + } else { + set_output(dx_res, dx); + } + } // indicate we will compute dx +} + +template +void layer_norm_grad(const Tensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const Tensor& mean, + const Tensor& variance, + const Tensor& out_grad, + float epsilon, + int begin_norm_axis, + Tensor* x_grad, + Tensor* scale_grad, + Tensor* bias_grad) { + auto x_dims = x.dims(); + auto shape_1 = 1; // front part + auto shape_2 = 1; // back part + for (int i = 0; i < begin_norm_axis; ++i) { + shape_1 *= x_dims[i]; + } + for (int i = begin_norm_axis; i < x.dims().size(); ++i) { + shape_2 *= x_dims[i]; + } + auto scale_ptr = scale.get_ptr(); + auto bias_ptr = bias.get_ptr(); + + auto x_cast = reshape(x, std::vector({shape_1, shape_2})); + auto out_grad_cast = + reshape(out_grad, std::vector({shape_1, shape_2})); + auto mean_ = reshape(mean, std::vector({shape_1, 1})); + auto variance_ = reshape(variance, std::vector({shape_1, 1})); + + Tensor scale_cast; + if (scale_ptr) { + scale_cast = reshape(*scale_ptr, std::vector({1, shape_2})); + } + + // cast dtype to float32 if dtype =float16 or bfloat16 + + auto x_sub_mean = x_cast - mean_; // M,N + auto tmp = (1.0 / (variance_ + epsilon)); // M,1 + // auto sqrt_var_1 = sqrt(tmp); // M,1 + auto sqrt_var_1 = elementwise_pow( + tmp, full(phi::vectorize(tmp.dims()), 0.5, tmp.dtype())); + auto x_sub_mean_mul_sqrt_var_1 = x_sub_mean * sqrt_var_1; + + if (x_grad) { + auto out_grad_scale = out_grad_cast; // M,N + if (scale_ptr) { + out_grad_scale = out_grad_cast * scale_cast; // M,N * 1,N = M,N + } + + auto dx_end = sqrt_var_1 * out_grad_scale; + auto d_mean = + dx_end.sum(std::vector({1}), x_cast.dtype(), true); // M,1 + + auto d_std_1 = + (tmp * x_sub_mean * out_grad_scale) + .sum(std::vector({1}), x_cast.dtype(), true); // M,1 + auto d_std = d_std_1 * x_sub_mean_mul_sqrt_var_1; // M,1 * M,N = M,N + + auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std); + auto x_grad_tmp = dx_end - d_mean_d_std; + x_grad_tmp = reshape(x_grad_tmp, phi::vectorize(x.dims())); + + set_output(x_grad_tmp, x_grad); + } + + if (scale_grad) { + if (scale_ptr) { + auto scale_grad_tmp = + (x_sub_mean_mul_sqrt_var_1 * out_grad_cast) + .sum(std::vector({0}), x_cast.dtype(), true); + scale_grad_tmp = reshape(scale_grad_tmp, scale_ptr->shape()); + set_output(scale_grad_tmp, scale_grad); + } else { + scale_grad = nullptr; + } + } + + if (bias_grad) { + if (bias_ptr) { + auto bias_grad_tmp = + out_grad_cast.sum(std::vector({0}), x_cast.dtype(), true); + bias_grad_tmp = reshape(bias_grad_tmp, bias_ptr->shape()); + set_output(bias_grad_tmp, bias_grad); + } else { + bias_grad = nullptr; + } } } diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 22fd0f40a36b5..80ecad93997db 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -464,6 +464,10 @@ void BindOpResult(py::module *m) { [](OpResult &self, OpResult 
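The add_grad, multiply_grad and elementwise_pow_grad rules above share one idiom: when an input was broadcast up to the output shape, its gradient must be summed back over the broadcast axes and then reshaped to the input shape. A minimal standalone sketch of how those reduce axes follow from the two shapes; it only illustrates the broadcasting rule and is independent of Paddle's get_reduce_dims helper:

    #include <cstdio>
    #include <vector>

    // Axes of the output over which an input's gradient must be summed,
    // assuming numpy-style broadcasting of in_shape against out_shape.
    std::vector<int> ReduceAxesForBroadcast(const std::vector<int>& out_shape,
                                            const std::vector<int>& in_shape) {
      std::vector<int> axes;
      int offset = static_cast<int>(out_shape.size() - in_shape.size());
      for (int i = 0; i < static_cast<int>(out_shape.size()); ++i) {
        // Leading axes the input lacks, and axes where the input had extent 1
        // but the output does not, were broadcast.
        if (i < offset || (in_shape[i - offset] == 1 && out_shape[i] != 1)) {
          axes.push_back(i);
        }
      }
      return axes;
    }

    int main() {
      // x: [2, 3, 4], y: [3, 1]  ->  out: [2, 3, 4]
      // dy is the output gradient summed over axes 0 and 2, reshaped to [3, 1].
      for (int axis : ReduceAxesForBroadcast({2, 3, 4}, {3, 1})) {
        std::printf("reduce over axis %d\n", axis);
      }
      return 0;
    }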
&other) { return paddle::dialect::add(self, other); }) + .def("__add__", + [](OpResult &self, float &bias) { + return paddle::dialect::scale(self, 1.0, bias, false); + }) .def("__sub__", [](OpResult &self, OpResult &other) { return paddle::dialect::subtract(self, other); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index a8260bb816865..9b5db92c54700 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -224,7 +224,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param: [x, y] - composite : elementwise_pow_grad(x, y, out_grad, axis, x_grad, y_grad) + composite : elementwise_pow_grad(x, y, out_grad, x_grad, y_grad) kernel : func : elementwise_pow_grad diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index e33c3a38bff74..f8a2aae71b0cd 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -94,7 +94,6 @@ def prepare_grad_outputs(grad_outputs, outputs, state): dtype=output.dtype, ) fillop = output_grad.get_defining_op() - update_bwdop_structure( backward_ops, state.op_to_opgrad[output.get_defining_op()], @@ -138,14 +137,14 @@ def prepare_grad_outputs(grad_outputs, outputs, state): 0.0, opresult.dtype, ) - fillop = grad.get_defining_op() + fillop = grad_value.get_defining_op() update_bwdop_structure( backward_ops, state.op_to_opgrad[opresult.get_defining_op()], fillop, ) - state.value_to_valuegrad[opresult] = [grad_value] + state.value_to_valuegrad[opresult] = [[grad_value]] visited_output.add(opresult) diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py index e9d04ede061ce..26a4ae73debd0 100644 --- a/python/paddle/decomposition/rules.py +++ b/python/paddle/decomposition/rules.py @@ -63,3 +63,83 @@ def gelu_composite(x, approximate): cdf = half * (one + _ir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype))) out = x * cdf return out + + +@register_decomp('pd_op.rsqrt') +def rsqrt_composite(x): + """define composite rule of op rsqrt.""" + # rsqrt(x) = x^(-0.5) + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + y = full(x.shape if len(x.shape) == 0 else [1], -0.5, x.dtype) + res = pow(x, y) + return res if not is_amp else cast(res, dtype) + + +@register_decomp('pd_op.pow') +def pow_composite(x, y): + """ + define composite rule of op pow + res = x^y + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + + if isinstance(y, (int, float)): + y = full(x.shape if len(x.shape) == 0 else [1], y, x.dtype) + res = pow(x, y) + if is_amp: + res = cast(res, dtype) + return res + + +@register_decomp('pd_op.layer_norm') +def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): + """ + define composite rule of op layer_norm + out = (x - mean(x)) / sqrt(var + epsilon)) + var = mean((x-mean(x))^2) + """ + is_amp = False + from paddle.base.data_feeder import convert_dtype + + dtype = convert_dtype(x.dtype) + if dtype in ["float16", "uint16"]: + is_amp = True + x = cast(x, "float32") + scale = cast(scale, "float32") if scale else scale + bias = cast(bias, "float32") if bias else bias + + axis = tuple(range(begin_norm_axis, len(x.shape))) + mean_ = mean(x, axis=axis, keepdim=True) + difference = x - mean_ + var_tmp1 = 
difference * difference + variance = mean(var_tmp1, axis=axis, keepdim=True) + var_tmp3 = variance + epsilon + rsqrt_var = rsqrt(var_tmp3) + out = difference * rsqrt_var + + if scale is not None: + if x.shape[begin_norm_axis:] != scale.shape: + scale = reshape(scale, x.shape[begin_norm_axis:]) + out = out * scale + if bias is not None: + if x.shape[begin_norm_axis:] != bias.shape: + bias = reshape(bias, x.shape[begin_norm_axis:]) + out = out + bias + + mean_ = reshape(mean_, [-1]) + variance = reshape(variance, [-1]) + if is_amp: + out = cast(out, dtype) + return out, mean_, variance diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c3e814cc906d4..f764fbb45996d 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -833,8 +833,7 @@ def full_like(x, fill_value, dtype=None, name=None): if in_dynamic_mode(): return _C_ops.full_like(x, fill_value, dtype, x.place) elif in_pir_mode(): - place = _current_expected_place() - return _C_ops.full_like(x, fill_value, dtype, place) + return _C_ops.full_like(x, fill_value, dtype, core.Place()) else: helper = LayerHelper("full_like", **locals()) check_variable_and_dtype( @@ -881,7 +880,11 @@ def full_like(x, fill_value, dtype=None, name=None): def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): if in_dynamic_or_pir_mode(): - place = _current_expected_place() + place = ( + _current_expected_place() + if not in_pir_mode() + else paddle.base.core.Place() + ) if force_cpu: place = core.CPUPlace() if isinstance(shape, (list, tuple)): diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py index e472c70813c73..f28957cdc89be 100644 --- a/test/legacy_test/prim_op_test.py +++ b/test/legacy_test/prim_op_test.py @@ -22,7 +22,8 @@ import paddle from paddle.autograd.ir_backward import grad as ir_grad -from paddle.base import core +from paddle.base import Scope, core +from paddle.base.executor import scope_guard from paddle.base.framework import ( OpProtoHolder, _dygraph_tracer, @@ -409,7 +410,8 @@ def check(self): self.check_jit_comp_with_cinn() else: if self.enable_check_static_comp: - self.check_static_comp() + with scope_guard(Scope()): + self.check_static_comp() def get_kernel_sig(self): with dygraph_guard(): @@ -870,7 +872,8 @@ def check(self): self.check_jit_comp_with_cinn() else: if self.enable_check_static_comp: - self.check_static_comp() + with scope_guard(Scope()): + self.check_static_comp() def get_output_dict(self, np_outputs, api_outputs, outputs_sig): assert len(api_outputs) <= len(outputs_sig), ( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 8b16ee5750eac..8d1ee1ac5091a 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -693,9 +693,21 @@ def test_check_grad(self): return # TODO(ScottWong98): set `check_prim=False` when `fill_any_like` supports `complex` dtype if self.dtype == np.complex64 or self.dtype == np.complex128: - self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=False) + self.check_grad( + ['X'], + 'Out', + check_prim=False, + check_prim_pir=False, + check_new_ir=False, + ) else: - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_dtype(self): # TODO If dtype is float64, the output (Out) has diff at CPUPlace @@ -1615,7 +1627,9 @@ def if_enable_cinn(self): pass def test_check_output(self): 
- self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_new_ir=True, check_prim_pir=True + ) def test_check_grad(self): if self.dtype == np.float16: @@ -1626,6 +1640,7 @@ def test_check_grad(self): max_relative_error=0.0005, check_prim=True, check_new_ir=True, + check_prim_pir=True, ) @@ -2480,12 +2495,22 @@ def setUp(self): self.cinn_atol = 1e-8 def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, + check_new_ir=True, + check_prim_pir=False, + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestGelu(TestActivation): @@ -2518,12 +2543,20 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_new_ir=True, check_prim_pir=False + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestGelu_ZeroDim(TestGelu): @@ -3575,12 +3608,20 @@ def if_enable_cinn(self): pass def test_check_output(self): - self.check_output(check_prim=True, check_new_ir=True) + self.check_output( + check_prim=True, check_prim_pir=True, check_new_ir=True + ) def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestPow_ZeroDim(TestPow): @@ -4397,6 +4438,7 @@ def create_test_act_fp16_class( grad_check=True, check_dygraph=True, check_prim=False, + check_prim_pir=False, enable_cinn=False, grad_atol=1e-2, **kwargs @@ -4425,6 +4467,7 @@ def test_check_output(self): atol=atol, check_dygraph=check_dygraph, check_prim=check_prim, + check_prim_pir=check_prim_pir, ) def test_check_grad(self): @@ -4437,6 +4480,7 @@ def test_check_grad(self): 'Out', check_dygraph=check_dygraph, check_prim=check_prim, + check_prim_pir=check_prim_pir, max_relative_error=grad_atol, ) @@ -4451,7 +4495,9 @@ def test_check_grad(self): create_test_act_fp16_class(TestSigmoid, check_prim=True, enable_cinn=True) create_test_act_fp16_class(TestSilu, check_prim=True, enable_cinn=True) create_test_act_fp16_class(TestLogSigmoid) -create_test_act_fp16_class(TestTanh, check_prim=True, enable_cinn=True) +create_test_act_fp16_class( + TestTanh, check_prim=True, check_prim_pir=True, enable_cinn=True +) create_test_act_fp16_class(TestTanhshrink) create_test_act_fp16_class(TestHardShrink) create_test_act_fp16_class(TestSoftshrink) @@ -4478,6 +4524,7 @@ def test_check_grad(self): create_test_act_fp16_class( TestGelu, check_prim=True, + check_prim_pir=True, check_new_ir=True, enable_cinn=True, rev_comp_rtol=1e-3, @@ -4499,7 +4546,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestLog10) create_test_act_fp16_class(TestLog1p) create_test_act_fp16_class(TestSquare) -create_test_act_fp16_class(TestPow, check_prim=True) +create_test_act_fp16_class(TestPow, check_prim=True, check_prim_pir=True) create_test_act_fp16_class(TestPow_API) create_test_act_fp16_class(TestSTanh) create_test_act_fp16_class(TestSoftplus) @@ -4521,7 +4568,11 @@ def test_check_grad(self): ) 
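# The edit repeated throughout these activation tests follows a single idiom:
# wherever a test already passes check_prim=True, a check_prim_pir=True switch is
# added next to it (and check_new_ir where present) so the prim decomposition is
# also exercised under PIR. A minimal sketch of the idiom is below; the class name
# TestExampleActivation is illustrative only and not part of this patch.
class TestExampleActivation(TestActivation):  # illustrative class, not in the original file
    def test_check_output(self):
        # Check the forward result under the legacy prim, PIR prim and new IR paths.
        self.check_output(check_prim=True, check_prim_pir=True, check_new_ir=True)

    def test_check_grad(self):
        if self.dtype == np.float16:
            return
        # Check the gradient with the same three switches enabled.
        self.check_grad(
            ['X'], 'Out', check_prim=True, check_prim_pir=True, check_new_ir=True
        )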
create_test_act_fp16_class(TestLeakyRelu_ZeroDim, check_prim=True) create_test_act_fp16_class( - TestRsqrt, check_prim=True, enable_cinn=True, check_new_ir=True + TestRsqrt, + check_prim=True, + enable_cinn=True, + check_new_ir=True, + check_prim_pir=True, ) @@ -4645,7 +4696,9 @@ def test_check_grad(self): create_test_act_bf16_class(TestLeakyReluAlpha2, check_prim=True) create_test_act_bf16_class(TestLeakyReluAlpha3, check_prim=True) create_test_act_bf16_class(TestLeakyRelu_ZeroDim, check_prim=True) -create_test_act_bf16_class(TestRsqrt, check_prim=True, check_new_ir=True) +create_test_act_bf16_class( + TestRsqrt, check_prim=True, check_new_ir=True, check_prim_pir=True +) if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py index 47bc23d76f601..448629431d0b1 100644 --- a/test/legacy_test/test_cast_op.py +++ b/test/legacy_test/test_cast_op.py @@ -52,10 +52,16 @@ def init_shapes(self): self.input_shape = [10, 10] def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_grad(self): - self.check_grad(['X'], ['Out'], check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + ['Out'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestCastOpFp32ToFp64_ZeroDim(TestCastOpFp32ToFp64): diff --git a/test/legacy_test/test_concat_op.py b/test/legacy_test/test_concat_op.py index dc9702beeb014..153e1cc06d308 100644 --- a/test/legacy_test/test_concat_op.py +++ b/test/legacy_test/test_concat_op.py @@ -61,18 +61,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_prim=True, check_new_ir=True + place, + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_prim=True, check_new_ir=True + place, + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_prim=True, check_new_ir=True + place, + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_test_data(self): if self.dtype == np.uint16: @@ -213,9 +246,27 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True) - self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def init_test_data(self): if self.dtype == np.uint16: @@ -301,8 +352,10 @@ def create_test_fp16(parent): 
class TestConcatFp16(parent): def setUp(self): self.op_type = "concat" + self.prim_op_type = "prim" self.python_api = paddle.concat self.public_python_api = paddle.concat + self.enable_cinn = False self.dtype = self.get_dtype() self.init_test_data() self.inputs = { @@ -332,18 +385,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_new_ir=True + place, + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_new_ir=True + place, + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_new_ir=True + place, + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_new_ir=True) - self.check_grad(['x1'], 'Out', check_new_ir=True) - self.check_grad(['x2'], 'Out', check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) def get_dtype(self): return np.float16 @@ -371,6 +457,7 @@ def create_test_bf16(parent): class TestConcatBf16(parent): def setUp(self): self.op_type = "concat" + self.prim_op_type = "prim" self.python_api = paddle.concat self.public_python_api = paddle.concat self.enable_cinn = False @@ -403,18 +490,51 @@ def test_check_grad(self): if self.dtype == np.uint16: place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['x0'], 'Out', check_new_ir=True + place, + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x1'], 'Out', check_new_ir=True + place, + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) self.check_grad_with_place( - place, ['x2'], 'Out', check_new_ir=True + place, + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, ) else: - self.check_grad(['x0'], 'Out', check_new_ir=True) - self.check_grad(['x1'], 'Out', check_new_ir=True) - self.check_grad(['x2'], 'Out', check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x1'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) + self.check_grad( + ['x2'], + 'Out', + check_new_ir=True, + check_prim=True, + check_prim_pir=True, + ) def get_dtype(self): return np.uint16 diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 8bacfc9a45cfd..546e9d2555421 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -57,6 +57,7 @@ def test_check_output(self): self.check_output( check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -69,6 +70,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -82,6 +84,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + 
check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -95,6 +98,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -152,6 +156,7 @@ def test_check_output(self): atol=1e-3, check_dygraph=self.check_dygraph(), check_prim=self.check_prim, + check_prim_pir=self.check_dygraph(), check_new_ir=self.check_dygraph(), ) @@ -167,6 +172,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -178,6 +184,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -221,6 +228,7 @@ def test_check_grad_normal(self): ['X', 'Y'], 'Out', check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -232,6 +240,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -243,6 +252,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 8013eb0baaf15..fde11e09fbe14 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -49,6 +49,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( check_dygraph=(not self.use_mkldnn), + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -59,6 +60,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -70,6 +72,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -81,6 +84,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -102,6 +106,7 @@ def if_enable_cinn(self): class TestComplexElementwiseMulOpWithCheckGrad(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.prim_op_type = "prim" self.python_api = paddle.multiply self.public_python_api = paddle.multiply self.dtype = np.complex128 @@ -188,7 +193,13 @@ def test_check_output(self): self.check_output(check_new_ir=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X', 'Y'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) def test_check_grad_ingore_x(self): self.check_grad( @@ -196,6 +207,7 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -205,6 +217,7 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -420,6 +433,7 @@ def test_check_grad_normal(self): 'Out', check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -431,6 +445,7 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), check_dygraph=(not self.use_mkldnn), 
check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -442,6 +457,7 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), check_dygraph=(not self.use_mkldnn), check_prim=True, + check_prim_pir=(not self.use_mkldnn), check_new_ir=(not self.use_mkldnn), ) @@ -496,6 +512,7 @@ def setUp(self): class TestComplexElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.prim_op_type = "prim" self.python_api = paddle.multiply self.init_base_dtype() self.init_input_output() diff --git a/test/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py index e406845960abc..c718ce16292b9 100644 --- a/test/legacy_test/test_elementwise_pow_op.py +++ b/test/legacy_test/test_elementwise_pow_op.py @@ -44,7 +44,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad_normal(self): if hasattr(self, 'attrs'): @@ -53,7 +53,11 @@ def test_check_grad_normal(self): ) else: self.check_grad( - ['X', 'Y'], 'Out', check_prim=True, check_new_ir=True + ['X', 'Y'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -190,6 +194,8 @@ class TestElementwisePowOpInt(OpTest): def setUp(self): self.op_type = "elementwise_pow" self.python_api = paddle.pow + self.public_python_api = paddle.pow + self.prim_op_type = "prim" self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])} self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} @@ -198,7 +204,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) class TestElementwisePowGradOpInt(unittest.TestCase): @@ -254,7 +260,7 @@ def test_check_output(self): if hasattr(self, 'attrs'): self.check_output(check_dygraph=False) else: - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad(self): self.check_grad( @@ -264,6 +270,7 @@ def test_check_grad(self): self.inputs['X'], self.inputs['Y'], 1 / self.inputs['X'].size ), check_prim=True, + check_prim_pir=True, check_new_ir=True, ) @@ -290,7 +297,7 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(out)} def test_check_output(self): - self.check_output(check_new_ir=True) + self.check_output(check_prim_pir=True, check_new_ir=True) def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') @@ -301,7 +308,7 @@ def test_check_grad(self): 'Out', check_prim=True, only_check_prim=True, - check_new_ir=True, + check_prim_pir=True, ) diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py index b023ff6488e48..3fb01bb3d0b62 100644 --- a/test/legacy_test/test_layer_norm_op.py +++ b/test/legacy_test/test_layer_norm_op.py @@ -141,8 +141,9 @@ def test_check_output(self): no_check_set=["Mean", "Variance"], atol=self.ori_atol, rtol=self.ori_rtol, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def test_check_grad(self): @@ -150,8 +151,9 @@ def test_check_grad(self): self.check_grad_input_list, ['Y'], max_relative_error=self.max_relative_error, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, 
) def initConfig(self): @@ -173,6 +175,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True def initTestCase(self): np.random.seed(123) @@ -240,8 +245,9 @@ def test_check_output(self): no_check_set=["Mean", "Variance"], atol=self.ori_atol, rtol=self.ori_rtol, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def test_check_grad(self): @@ -250,8 +256,9 @@ def test_check_grad(self): self.check_grad_input_list, ['Y'], max_relative_error=self.max_relative_error, - check_prim=True, - check_new_ir=True, + check_prim=self.check_prim, + check_prim_pir=self.check_prim_pir, + check_new_ir=self.check_new_ir, ) def initConfig(self): @@ -266,6 +273,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True def initTestCase(self): np.random.seed(123) @@ -335,6 +345,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -356,6 +369,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -382,6 +398,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -403,6 +422,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -429,6 +451,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True @unittest.skipIf( @@ -450,6 +475,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32(TestLayerNormOpByOpTest): @@ -467,6 +495,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = True + self.check_prim = True + self.check_prim_pir = True + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case2(TestLayerNormOpByOpTest): @@ -484,6 +515,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case3(TestLayerNormOpByOpTest): @@ -501,6 +535,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = True self.has_bias = False + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOpByOpTestFP32_case4(TestLayerNormOpByOpTest): @@ -518,6 +555,9 @@ def initConfig(self): self.begin_norm_axis = 1 self.has_scale = False self.has_bias = True + self.check_prim = False + self.check_prim_pir = False + self.check_new_ir = True class TestLayerNormOp(unittest.TestCase): diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index c9ab6baf41ef6..0a9132ca55b49 100755 --- 
a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle from paddle import base @@ -43,11 +43,17 @@ def init_data(self): self.new_shape = (12, 10) self.infered_shape = (12, 10) - def test_check_output(self): + def _test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True) + self.check_grad( + ["X"], + "Out", + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) class TestReshapeOp_ZeroDim1(TestReshapeOp): @@ -120,7 +126,7 @@ def test_check_output(self): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True) class TestReshapeFP16Op(OpTest): @@ -148,7 +154,7 @@ def test_check_output(self): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out", check_prim=True) + self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True) class TestReshapeOpDimInfer1(TestReshapeOp): @@ -340,6 +346,9 @@ def init_dtype(self): self.dtype = np.uint8 +@skip_check_grad_ci( + "we don't need to check grad for the bool type of reshape op" +) class TestReshapeOpBool(TestReshapeOp): def setUp(self): self.init_data() diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py index 964e127aafb81..92dfe72f8443e 100644 --- a/test/legacy_test/test_split_op.py +++ b/test/legacy_test/test_split_op.py @@ -61,7 +61,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -117,7 +121,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -243,7 +251,11 @@ def test_check_output(self): def test_check_grad(self): self.check_grad( - ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True + ['X'], + ['out0', 'out1', 'out2'], + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) @@ -291,7 +303,12 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) self.check_grad_with_place( - place, ['X'], 'out2', check_prim=True, check_new_ir=True + place, + ['X'], + 'out2', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, ) cls_name = "{}_{}".format(parent.__name__, "BF16Op") diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 63a68442936ab..c154625fb51f4 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -58,11 +58,20 @@ def init_kernel_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_prim=True, check_cinn=True, check_new_ir=True) + self.check_output( + check_prim=True, + check_cinn=True, + check_new_ir=True, + ) def test_check_grad(self): self.check_grad( - ['x0'], 'Out', check_prim=True, check_cinn=True, check_new_ir=True + ['x0'], + 'Out', + check_prim=True, + check_cinn=True, + check_prim_pir=True, + check_new_ir=True, ) @@ 
-304,7 +313,13 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_grad(['x0'], 'Out', check_cinn=True, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_cinn=True, + check_prim_pir=True, + check_new_ir=True, + ) def create_test_sum_fp16_class(parent): @@ -330,7 +345,9 @@ def test_w_is_selected_rows(self): class TestSumBF16Op(OpTest): def setUp(self): self.op_type = "sum" + self.prim_op_type = "prim" self.python_api = paddle.add_n + self.public_python_api = paddle.add_n self.init_kernel_type() x0 = np.random.random((3, 40)).astype(np.float32) x1 = np.random.random((3, 40)).astype(np.float32) @@ -354,7 +371,13 @@ def test_check_output(self): def test_check_grad(self): # new dynamic graph mode does not support unit16 type - self.check_grad(['x0'], 'Out', check_dygraph=False, check_new_ir=True) + self.check_grad( + ['x0'], + 'Out', + check_dygraph=False, + check_prim_pir=True, + check_new_ir=True, + ) class API_Test_Add_n(unittest.TestCase): diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index c8d91f59f8c49..52f85ef1e0a70 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -52,7 +52,13 @@ def test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_new_ir=True, + check_prim_pir=True, + ) def if_enable_cinn(self): pass @@ -209,7 +215,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestAutoTuneTransposeFP16Op(OpTest): @@ -246,7 +258,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestAutoTuneTransposeBF16Op(OpTest): @@ -290,7 +308,13 @@ def test_check_output(self): base.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) class TestTransposeFP16Op(OpTest): @@ -325,7 +349,13 @@ def test_check_output(self): self.check_output(no_check_set=['XShape'], check_new_ir=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True) + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_prim_pir=True, + check_new_ir=True, + ) def initTestCase(self): self.shape = (3, 40) From 6e5c978878e401b9d383de91078f82520fa40cf1 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Thu, 21 Sep 2023 18:35:15 +0800 Subject: [PATCH 36/39] =?UTF-8?q?=E3=80=90pir=E3=80=91Modify=20comment=20o?= =?UTF-8?q?f=20pr57478=20and=20pr56873=20(#57520)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tmp * reply comment * code style --- .../fluid/pir/dialect/op_generator/api_gen.py | 2 +- .../pir/dialect/op_generator/python_c_gen.py | 2 +- .../pir/dialect/operator/ir/manual_api.cc | 23 ++++++++++--------- 
.../pir/dialect/operator/ir/manual_api.h | 21 +++++++++-------- .../pir/dialect/operator/ir/manual_op_vjp.cc | 4 +++- .../primitive/backend/manual/manual_backend.h | 1 - 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index d7e74f72b652f..851f318e9bc47 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -150,7 +150,7 @@ def _gen_api_inputs(self, op_info): assert len(name_list) == len(type_list) ret = [] for name, type in zip(name_list, type_list): - ret.append(f'{self._type_map[type]} {name}') + ret.append(f'const {self._type_map[type]}& {name}') return ', '.join(ret) def _gen_api_attrs( diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py index 440f656b99964..adb5270e975e6 100644 --- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py @@ -174,7 +174,7 @@ """ BUILTIN_STACK_OP_TEMPLATE = """ - {name} = paddle::dialect::stack({name}_tmp, 0); + {name} = paddle::dialect::stack({name}_tmp, /*axis*/0); """ TYPE_TO_FUNC_MAP = { "bool": "CastPyArg2Boolean", diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index 24e7a94b66650..eb5acbf2388ea 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -28,8 +28,8 @@ pir::OpResult builtin_combine(const std::vector& x) { return combine_op.out(); } -std::vector add_n_grad(std::vector inputs, - pir::Value out_grad) { +std::vector add_n_grad(const std::vector& inputs, + const pir::Value& out_grad) { std::vector inputs_grad; for (size_t i = 0; i < inputs.size(); i++) { paddle::dialect::ScaleOp scale_op = @@ -40,8 +40,8 @@ std::vector add_n_grad(std::vector inputs, return inputs_grad; } -pir::OpResult zeros_like(pir::Value x, - phi::DataType dtype, +pir::OpResult zeros_like(const pir::Value& x, + const phi::DataType dtype, const Place& place) { return paddle::dialect::full_like(x, 0, dtype, place); } @@ -54,7 +54,7 @@ pir::OpResult get_parameter(const std::string& name) { return get_parameter_op.result(0); } -void set_parameter(pir::Value parameter, const std::string& name) { +void set_parameter(const pir::Value& parameter, const std::string& name) { std::unique_ptr param( new pir::Parameter(nullptr, 0, parameter.type())); APIBuilder::Instance().SetParameter(name, std::move(param)); @@ -62,9 +62,9 @@ void set_parameter(pir::Value parameter, const std::string& name) { name); } -pir::OpResult embedding_grad(pir::Value x, - pir::Value weight, - pir::Value out_grad, +pir::OpResult embedding_grad(const pir::Value& x, + const pir::Value& weight, + const pir::Value& out_grad, int64_t padding_idx, bool sparse) { if (weight.type().isa()) { @@ -81,7 +81,8 @@ pir::OpResult embedding_grad(pir::Value x, } } -pir::OpResult split_with_num_grad(std::vector out_grad, int axis) { +pir::OpResult split_with_num_grad(const std::vector& out_grad, + int axis) { auto out_grad_combine_op = APIBuilder::Instance().GetBuilder()->Build(out_grad); paddle::dialect::SplitGradOp split_grad_op = @@ -90,8 +91,8 @@ pir::OpResult split_with_num_grad(std::vector out_grad, int axis) { return split_grad_op.result(0); } -pir::OpResult split_with_num_grad(std::vector out_grad, - pir::Value axis) { +pir::OpResult split_with_num_grad(const std::vector& 
out_grad, + const pir::Value& axis) { auto out_grad_combine_op = APIBuilder::Instance().GetBuilder()->Build(out_grad); paddle::dialect::SplitGradOp split_grad_op = diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index c919448f1ddb0..fe579295ad5a0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -25,26 +25,27 @@ namespace dialect { pir::OpResult builtin_combine(const std::vector& x); -std::vector add_n_grad(std::vector inputs, - pir::Value out_grad); +std::vector add_n_grad(const std::vector& inputs, + const pir::Value& out_grad); -pir::OpResult zeros_like(pir::Value x, +pir::OpResult zeros_like(const pir::Value& x, phi::DataType dtype = phi::DataType::UNDEFINED, const Place& place = {}); pir::OpResult get_parameter(const std::string& name); -void set_parameter(pir::Value parameter, const std::string& name); +void set_parameter(const pir::Value& parameter, const std::string& name); -pir::OpResult embedding_grad(pir::Value x, - pir::Value weight, - pir::Value out_grad, +pir::OpResult embedding_grad(const pir::Value& x, + const pir::Value& weight, + const pir::Value& out_grad, int64_t padding_idx = -1, bool sparse = false); -pir::OpResult split_with_num_grad(std::vector out_grad, int axis); +pir::OpResult split_with_num_grad(const std::vector& out_grad, + int axis); -pir::OpResult split_with_num_grad(std::vector out_grad, - pir::Value axis); +pir::OpResult split_with_num_grad(const std::vector& out_grad, + const pir::Value& axis); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc index b6d131e5411fb..80c13ac89def1 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc @@ -34,7 +34,9 @@ std::vector> AddNOp::Vjp( AddNOp op_obj = op->dyn_cast(); VLOG(6) << "Prepare inputs of add_n_grad"; - + PADDLE_ENFORCE( + op_obj.inputs() != nullptr, + paddle::platform::errors::Fatal("addn op's inputs can't be null")); pir::CombineOp combine_op_obj = op_obj.inputs() .dyn_cast() .owner() diff --git a/paddle/fluid/primitive/backend/manual/manual_backend.h b/paddle/fluid/primitive/backend/manual/manual_backend.h index 16c1facbd5354..3c9340164ac01 100644 --- a/paddle/fluid/primitive/backend/manual/manual_backend.h +++ b/paddle/fluid/primitive/backend/manual/manual_backend.h @@ -18,7 +18,6 @@ #include #include "paddle/phi/api/include/tensor.h" -#include "paddle/utils/optional.h" namespace paddle { namespace primitive { From 69ad1735436555288b1adb88f731cd67ef8240d9 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 21 Sep 2023 18:39:12 +0800 Subject: [PATCH 37/39] [SOT][3.11] fix eval frame for python 3.11 (#57490) * [SOT] fix eval frame for python 3.11 * fix missing `()` * fix no Paddle_PyInterpreterFrameProxyType in < 3.11 * `Paddle_PyInterpreterFrameProxy` -> `PyInterpreterFrameProxy` * compat for eval_custom_code * clean callback result is None logic * refine internal API name * refine comments --- paddle/fluid/pybind/jit.cc | 364 ++++++++++++++++++++++++++++--------- 1 file changed, 275 insertions(+), 89 deletions(-) diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc index 69b32fca9cd75..688fe7c670370 100644 --- a/paddle/fluid/pybind/jit.cc +++ b/paddle/fluid/pybind/jit.cc @@ -21,7 +21,14 @@ limitations under the License. 
*/ #include #endif #if PY_VERSION_HEX >= 0x030b0000 +#include #include +#define Py_BUILD_CORE // internal/pycore_opcode.h need this macro +#define NEED_OPCODE_TABLES // To get _PyOpcode_Caches and _PyOpcode_Deopt +#include +#undef NEED_OPCODE_TABLES +#undef Py_BUILD_CORE +#include #endif #include @@ -49,64 +56,181 @@ namespace pybind { // that we don't need any modification in eval_frame functions. typedef _PyInterpreterFrame FrameObject; #define CALL_STAT_INC(name) ((void)0) -PyFrameObject *Paddle_PyFrame_New_NoTrack(PyCodeObject *code) { - CALL_STAT_INC(frame_objects_created); - int slots = code->co_nlocalsplus + code->co_stacksize; - PyFrameObject *f = PyObject_GC_NewVar(PyFrameObject, &PyFrame_Type, slots); - if (f == NULL) { - return NULL; + +// clang-format off +// Define a proxy PyObject to access _PyInterpreterFrame's properties. +// It will be passed as an argument to the eval frame's callback. +typedef struct PyInterpreterFrameProxy { + PyObject_HEAD + _PyInterpreterFrame *frame; +} PyInterpreterFrameProxy; +// clang-format on + +#define DECLARE_PROXY_PROPERTY(name) \ + static PyObject *PyInterpreterFrameProxy_property_##name( \ + PyInterpreterFrameProxy *self, void *closure) { \ + Py_XINCREF(self->frame->name); \ + return reinterpret_cast(self->frame->name); \ + } + +// clang-format off +#define REGISTER_PROXY_PROPERTY(name) \ + { \ + #name, (getter)PyInterpreterFrameProxy_property_##name, nullptr, nullptr, \ + nullptr \ + } +// clang-format on + +DECLARE_PROXY_PROPERTY(f_code) +DECLARE_PROXY_PROPERTY(f_locals) +DECLARE_PROXY_PROPERTY(f_globals) +DECLARE_PROXY_PROPERTY(f_builtins) + +static PyGetSetDef PyInterpreterFrameProxy_properties[] = { + REGISTER_PROXY_PROPERTY(f_code), + REGISTER_PROXY_PROPERTY(f_locals), + REGISTER_PROXY_PROPERTY(f_globals), + REGISTER_PROXY_PROPERTY(f_builtins), + {nullptr} /* Sentinel */ +}; + +// clang-format off +static PyTypeObject PyInterpreterFrameProxyType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "paddle.framework.core.PyInterpreterFrameProxy", + .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, " + "it's only define all properties we need."), + .tp_basicsize = sizeof(PyInterpreterFrameProxy), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_getset = PyInterpreterFrameProxy_properties, +}; +// clang-format on + +PyInterpreterFrameProxy *PyInterpreterFrameProxy_New( + _PyInterpreterFrame *frame) { + PyTypeObject *type = &PyInterpreterFrameProxyType; + PyInterpreterFrameProxy *self = + reinterpret_cast(type->tp_alloc(type, 0)); + if (!self) { + VLOG(7) << "Failed to allocate PyInterpreterFrameProxy"; + return nullptr; } - f->f_back = NULL; - f->f_trace = NULL; - f->f_trace_lines = 1; - f->f_trace_opcodes = 0; - f->f_fast_as_locals = 0; - f->f_lineno = 0; - return f; + self->frame = frame; + return self; } -static inline bool Paddle_PyFrame_IsIncomplete(_PyInterpreterFrame *frame) { - return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->prev_instr < - _PyCode_CODE(frame->f_code) + frame->f_code->_co_firsttraceable; +// We copy some cpython internal API from cpython project. +// To avoid name conflict, we use "Internal_" prefix to mark them. 
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame, + int opcode, + int oparg) { + // This only works when opcode is a non-quickened form: + assert(_PyOpcode_Deopt[opcode] == opcode); + int check_oparg = 0; + for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code); + instruction < frame->prev_instr; + instruction++) { + int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)]; + check_oparg |= _Py_OPARG(*instruction); + if (check_opcode == opcode && check_oparg == oparg) { + return 1; + } + if (check_opcode == EXTENDED_ARG) { + check_oparg <<= 8; + } else { + check_oparg = 0; + } + instruction += _PyOpcode_Caches[check_opcode]; + } + return 0; } -PyFrameObject *Paddle_PyFrame_MakeAndSetFrameObject( - _PyInterpreterFrame *frame) { - assert(frame->frame_obj == NULL); - PyObject *error_type, *error_value, *error_traceback; - PyErr_Fetch(&error_type, &error_value, &error_traceback); - - PyFrameObject *f = Paddle_PyFrame_New_NoTrack(frame->f_code); - if (f == NULL) { - Py_XDECREF(error_type); - Py_XDECREF(error_value); - Py_XDECREF(error_traceback); - return NULL; // NOLINT +int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) { + /* Merge fast locals into f->f_locals */ + PyObject *locals; + PyObject **fast; + PyCodeObject *co; + locals = frame->f_locals; + if (locals == NULL) { + locals = frame->f_locals = PyDict_New(); + if (locals == NULL) return -1; } - PyErr_Restore(error_type, error_value, error_traceback); - if (frame->frame_obj) { - f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data; // NOLINT - f->f_frame->owner = FRAME_CLEARED; - f->f_frame->frame_obj = f; - Py_DECREF(f); - return frame->frame_obj; + co = frame->f_code; + fast = _PyFrame_GetLocalsArray(frame); + // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt + // here: + int lasti = _PyInterpreterFrame_LASTI(frame); + if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) { + /* Free vars have not been initialized -- Do that */ + PyCodeObject *co = frame->f_code; + PyObject *closure = frame->f_func->func_closure; + int offset = co->co_nlocals + co->co_nplaincellvars; + for (int i = 0; i < co->co_nfreevars; ++i) { + PyObject *o = PyTuple_GET_ITEM(closure, i); + Py_INCREF(o); + frame->localsplus[offset + i] = o; + } + // COPY_FREE_VARS doesn't have inline CACHEs, either: + frame->prev_instr = _PyCode_CODE(frame->f_code); } - assert(frame->owner != FRAME_OWNED_BY_FRAME_OBJECT); - assert(frame->owner != FRAME_CLEARED); - f->f_frame = frame; - frame->frame_obj = f; - return f; -} + for (int i = 0; i < co->co_nlocalsplus; i++) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); + + /* If the namespace is unoptimized, then one of the + following cases applies: + 1. It does not contain free variables, because it + uses import * or is a top-level namespace. + 2. It is a class namespace. + We don't want to accidentally copy free variables + into the locals dict used by the class. + */ + if (kind & CO_FAST_FREE && !(co->co_flags & CO_OPTIMIZED)) { + continue; + } -static inline PyFrameObject *Paddle_PyFrame_GetFrameObject( - _PyInterpreterFrame *frame) { - assert(!Paddle_PyFrame_IsIncomplete(frame)); - PyFrameObject *res = frame->frame_obj; - if (res != NULL) { - return res; + PyObject *name = PyTuple_GET_ITEM(co->co_localsplusnames, i); + PyObject *value = fast[i]; + if (frame->stacktop) { + if (kind & CO_FAST_FREE) { + // The cell was set by COPY_FREE_VARS. 
+ assert(value != NULL && PyCell_Check(value)); + value = PyCell_GET(value); + } else if (kind & CO_FAST_CELL) { + // Note that no *_DEREF ops can happen before MAKE_CELL + // executes. So there's no need to duplicate the work + // that MAKE_CELL would otherwise do later, if it hasn't + // run yet. + if (value != NULL) { + if (PyCell_Check(value) && + Internal_PyFrame_OpAlreadyRan(frame, MAKE_CELL, i)) { + // (likely) MAKE_CELL must have executed already. + value = PyCell_GET(value); + } + // (likely) Otherwise it it is an arg (kind & CO_FAST_LOCAL), + // with the initial value set when the frame was created... + // (unlikely) ...or it was set to some initial value by + // an earlier call to PyFrame_LocalsToFast(). + } + } + } else { + assert(value == NULL); + } + if (value == NULL) { + if (PyObject_DelItem(locals, name) != 0) { + if (PyErr_ExceptionMatches(PyExc_KeyError)) { + PyErr_Clear(); + } else { + return -1; + } + } + } else { + if (PyObject_SetItem(locals, name, value) != 0) { + return -1; + } + } } - return Paddle_PyFrame_MakeAndSetFrameObject(frame); + return 0; } #else @@ -145,37 +269,84 @@ inline static PyObject *eval_frame_default(PyThreadState *tstate, #endif } -// Start a new frame and run code in this frame. -// Execute a piece of code by default frame-hook. -inline static PyObject *eval_custom_code(PyThreadState *tstate, - FrameObject *frame, - PyCodeObject *code, - int throw_flag) { +#if PY_VERSION_HEX >= 0x030b0000 + +inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { + // Create a new PyInterpreterFrame. Refer to CALL. + // PyInterpreterFrame has a head section calls "specials". It follows + // a contiguous section containing localplus and interpreter stack space. + size_t size = code->co_nlocalsplus + code->co_stacksize + FRAME_SPECIALS_SIZE; + CALL_STAT_INC(frames_pushed); + _PyInterpreterFrame *shadow = reinterpret_cast<_PyInterpreterFrame *>( + malloc(sizeof(PyObject *) * size)); + if (shadow == nullptr) { + VLOG(7) << "Failed to allocate memory for shadow frame."; + return nullptr; + } + // Create a new function object from code object. Refer to MAKE_FUNCTION. + PyFunctionObject *func = reinterpret_cast( + PyFunction_New(reinterpret_cast(code), frame->f_globals)); + _PyFrame_InitializeSpecials(shadow, func, nullptr, code->co_nlocalsplus); + + PyObject **fastlocals_old = frame->localsplus; + PyObject **fastlocals_new = shadow->localsplus; + + for (size_t i = 0; i < code->co_nlocalsplus; ++i) { + fastlocals_new[i] = nullptr; + } + + // The namemap to map the name to index in new frame localsplus. 
+ PyObject *namemap = PyDict_New(); + if (namemap == nullptr) { + VLOG(7) << "Failed to create namemap."; + free(shadow); + return nullptr; + } + for (size_t i = 0; i < code->co_nlocalsplus; ++i) { + PyObject *name = PyTuple_GET_ITEM(code->co_localsplusnames, i); + PyObject *index = PyLong_FromSize_t(i); + PyDict_SetItem(namemap, name, index); + } + for (size_t i = 0; i < frame->f_code->co_nlocalsplus; ++i) { + PyObject *name = PyTuple_GET_ITEM(frame->f_code->co_localsplusnames, i); + PyObject *index = PyDict_GetItem(namemap, name); + if (index == nullptr) { + continue; + } + Py_XINCREF(fastlocals_old[i]); + fastlocals_new[PyLong_AsSize_t(index)] = fastlocals_old[i]; + } + + PyObject *result = eval_frame_default(tstate, shadow, throw_flag); + free(shadow); + Py_DECREF(namemap); + return result; +} + +#else + +inline static PyObject *eval_custom_code_py310_minus(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { Py_ssize_t ncells = 0; Py_ssize_t nfrees = 0; Py_ssize_t nlocals_new = code->co_nlocals; Py_ssize_t nlocals_old = frame->f_code->co_nlocals; -#if PY_VERSION_HEX >= 0x030b0000 - ncells = code->co_ncellvars; - nfrees = code->co_nfreevars; -#else ncells = PyTuple_GET_SIZE(code->co_cellvars); nfrees = PyTuple_GET_SIZE(code->co_freevars); -#endif PyFrameObject *shadow = PyFrame_New(tstate, code, frame->f_globals, nullptr); if (shadow == nullptr) { return nullptr; } -#if PY_VERSION_HEX >= 0x030b0000 - PyObject **fastlocals_old = frame->localsplus; - PyObject **fastlocals_new = shadow->f_frame->localsplus; -#else PyObject **fastlocals_old = frame->f_localsplus; PyObject **fastlocals_new = shadow->f_localsplus; -#endif for (Py_ssize_t i = 0; i < nlocals_old; i++) { Py_XINCREF(fastlocals_old[i]); @@ -187,15 +358,26 @@ inline static PyObject *eval_custom_code(PyThreadState *tstate, fastlocals_new[nlocals_new + i] = fastlocals_old[nlocals_old + i]; } -#if PY_VERSION_HEX >= 0x030b0000 - PyObject *result = eval_frame_default(tstate, shadow->f_frame, throw_flag); -#else PyObject *result = eval_frame_default(tstate, shadow, throw_flag); -#endif Py_DECREF(shadow); return result; } +#endif + +// Start a new frame and run code in this frame. +// Execute a piece of code by default frame-hook. +inline static PyObject *eval_custom_code(PyThreadState *tstate, + FrameObject *frame, + PyCodeObject *code, + int throw_flag) { +#if PY_VERSION_HEX >= 0x030b0000 + return eval_custom_code_py311_plus(tstate, frame, code, throw_flag); +#else + return eval_custom_code_py310_minus(tstate, frame, code, throw_flag); +#endif +} + static PyObject *_custom_eval_frame(PyThreadState *tstate, FrameObject *frame, int throw_flag, @@ -203,13 +385,16 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, // https://peps.python.org/pep-0558/#fast-locals-proxy-implementation-details // https://devguide.python.org/internals/interpreter/#all-sorts-of-variables #if PY_VERSION_HEX >= 0x030b0000 - // _PyFrame_GetFrameObject(frame) # this function should be the right answer, - // but nm libpython.so | grep _PyFrame_MakeAndSetFrameObject is a `t' symbol, - // which means it's local to library. we will get a link error if we use it. 
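  // Sketch of the control flow implemented below, for reference only: frames owned
  // by a generator fall through to the default evaluator; otherwise the fast locals
  // are merged into f_locals via the copied Internal_PyFrame_FastToLocalsWithError
  // helper, the callback is temporarily unset, and it is then invoked with a
  // PyInterpreterFrameProxy wrapping the raw _PyInterpreterFrame. The returned
  // object is expected to expose a `code` attribute (a code object or None) and a
  // `disable_eval_frame` flag; depending on them, either the transformed code runs
  // through eval_custom_code or the original frame runs through eval_frame_default,
  // and the callback is re-enabled afterwards.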
if (frame->owner == FRAME_OWNED_BY_GENERATOR) { return eval_frame_default(tstate, frame, throw_flag); } - if (PyFrame_FastToLocalsWithError(Paddle_PyFrame_GetFrameObject(frame)) < 0) { + // PyFrame_FastToLocalsWithError receives a PyFrameObject, but if we created a + // PyFrameObject from a PyInterpreterFrame, it will changes the original + // PyInterpreterFrame and causes a Segmentation Fault when Fallback to run + // original frame. So we pass a PyInterpreterFrame to + // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we + // copy many code from CPython project into our project. + if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) { #else if (PyFrame_FastToLocalsWithError(frame) < 0) { #endif @@ -236,39 +421,38 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, eval_frame_callback_set(Py_None); #if PY_VERSION_HEX >= 0x030b0000 - PyObject *args = Py_BuildValue("(O)", Paddle_PyFrame_GetFrameObject(frame)); + PyObject *args = Py_BuildValue("(O)", PyInterpreterFrameProxy_New(frame)); #else PyObject *args = Py_BuildValue("(O)", frame); #endif PyObject *result = PyObject_CallObject(callback, args); Py_DECREF(args); VLOG(7) << "After call eval_frame_function and decrease frame."; - // result: GuardedCode + // class CustomCode(Protocal): + // code: CodeType | None + // disable_eval_frame: bool + // result: CustomCode if (result == nullptr) { // internal exception VLOG(7) << "Error happened."; return nullptr; - } else if (result != Py_None) { + } else { // NOTE: Cache is not supported now PyCodeObject *code = reinterpret_cast( PyObject_GetAttrString(result, "code")); PyObject *disable_eval_frame = PyObject_GetAttrString(result, "disable_eval_frame"); + PyObject *out; + VLOG(7) << "Start eval new frame and code."; if (disable_eval_frame != Py_True) { // Re-enable custom behavior eval_frame_callback_set(callback); - VLOG(7) << "Start eval new frame and code."; - PyObject *out; if (reinterpret_cast(code) != Py_None) { out = eval_custom_code(tstate, frame, code, throw_flag); } else { out = eval_frame_default(tstate, frame, throw_flag); } - Py_DECREF(result); - Py_DECREF(code); - return out; } else { - PyObject *out; if (reinterpret_cast(code) != Py_None) { out = eval_custom_code(tstate, frame, code, throw_flag); } else { @@ -276,14 +460,10 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, } // Re-enable custom behavior eval_frame_callback_set(callback); - Py_DECREF(result); - Py_DECREF(code); - return out; } - } else { - // Re-enable custom behavior - eval_frame_callback_set(callback); - return eval_frame_default(tstate, frame, throw_flag); + Py_DECREF(result); + Py_DECREF(code); + return out; } } @@ -414,6 +594,12 @@ void BindEvalFrame(pybind11::module *m) { return obj; }, py::arg("callback")); +#if PY_VERSION_HEX >= 0x030b0000 + if (PyType_Ready(&PyInterpreterFrameProxyType) < 0) { + VLOG(7) << "PyInterpreterFrameProxyType has not been ready!"; + } + Py_INCREF(&PyInterpreterFrameProxyType); +#endif } } // namespace pybind From 177c1397ec774a286fa6a203dd0fa249b685d963 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Thu, 21 Sep 2023 20:20:07 +0800 Subject: [PATCH 38/39] [PIR] register fused_attention in pir (#57557) * register fused_attention in pir * fix * fix --- .../pir/dialect/op_generator/ops_api_gen.py | 1 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 9 + paddle/phi/api/yaml/op_compat.yaml | 35 +++ paddle/phi/infermeta/multiary.cc | 247 ++++++++++++++++++ paddle/phi/infermeta/multiary.h | 49 ++++ test/white_list/new_ir_op_test_white_list 
| 1 + 6 files changed, 342 insertions(+) diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 9f04a9b2fd4b2..e11b2ad1c1bf1 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -95,6 +95,7 @@ 'c_allreduce_max', 'c_allgather', 'seed', + "fused_attention", ] diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 8babc4635b8fb..d3cbc31c2e490 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -170,3 +170,12 @@ args : (Tensor i, Tensor x) output : Tensor[](out) backward: write_to_array_grad + +- op: fused_attention + args: (Tensor x, Tensor ln_scale, Tensor ln_bias, Tensor qkv_weight, Tensor qkv_bias, Tensor cache_kv, Tensor src_mask, Tensor out_linear_weight, Tensor out_linear_bias, Tensor ln_scale_2, Tensor ln_bias_2, int num_heads, bool transpose_qkv_wb, bool pre_layer_norm, float epsilon, float attn_dropout_rate, bool is_test, bool attn_dropout_fix_seed, int attn_dropout_seed, str attn_dropout_implementation, float dropout_rate, bool dropout_fix_seed, int dropout_seed, str dropout_implementation, float ln_epsilon, bool add_residual, int ring_id) + output: Tensor(ln_mean), Tensor(ln_var), Tensor(ln_out), Tensor(qkv_out), Tensor(qkv_bias_out), Tensor(transpose_out_2), Tensor(qk_out), Tensor(qktv_out), Tensor(softmax_out), Tensor(attn_dropout_mask_out), Tensor(attn_dropout_out), Tensor(src_mask_out), Tensor(fmha_out), Tensor(out_linear_out), Tensor(dropout_mask_out), Tensor(ln_mean_2), Tensor(ln_var_2), Tensor(bias_dropout_residual_out), Tensor(cache_kv_out), Tensor(out) + kernel: + func: fused_attention + infer_meta: + func: FusedAttentionInferMeta + optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2 diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8a85147a66da0..63093631e4347 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1181,6 +1181,41 @@ data_type : float support_tensor : true +- op : fused_attention + inputs: + x: X + ln_scale: LnScale + ln_bias: LnBias + qkv_weight: QKVW + qkv_bias: QKVBias + cache_kv: CacheKV + src_mask: SrcMask + out_linear_weight: OutLinearW + out_linear_bias: OutLinearBias + ln_scale_2: Ln2Scale + ln_bias_2: Ln2Bias + outputs: + ln_mean: LnMean + ln_var: LnVariance + ln_out: LnOut + qkv_out: QKVOut + qkv_bias_out: QKVBiasOut + transpose_out_2: TransposeOut2 + qk_out: QKOut + qktv_out: QKTVOut + softmax_out: SoftmaxOut + attn_dropout_mask_out: AttnDropoutMaskOut + attn_dropout_out: AttnDropoutOut + src_mask_out: SrcMaskOut + fmha_out: FMHAOut + out_linear_out: OutLinearOut + dropout_mask_out: DropoutMaskOut + ln_mean_2: Ln2Mean + ln_var_2: Ln2Variance + bias_dropout_residual_out: BiasDropoutResidualOut + cache_kv_out: CacheKVOut + out: Y + - op : fused_batch_norm_act backward : fused_batch_norm_act_grad inputs: diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 8de465867273c..6b09dd22db263 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1604,6 +1604,253 @@ void FusedBiasActInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } +void FusedAttentionInferMeta(const MetaTensor& x, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const MetaTensor& qkv_weight, + const MetaTensor& 
qkv_bias, + const MetaTensor& cache_kv, + const MetaTensor& src_mask, + const MetaTensor& out_linear_weight, + const MetaTensor& out_linear_bias, + const MetaTensor& ln_scale_2, + const MetaTensor& ln_bias_2, + int num_heads, + bool transpose_qkv_wb, + bool pre_layer_norm, + float epsilon, + float attn_dropout_rate, + bool is_test, + bool attn_dropout_fix_seed, + int attn_dropout_seed, + const std::string& attn_dropout_implementation, + float dropout_rate, + bool dropout_fix_seed, + int dropout_seed, + const std::string& dropout_implementation, + float ln_epsilon, + bool add_residual, + int ring_id, + MetaTensor* ln_mean, + MetaTensor* ln_var, + MetaTensor* ln_out, + MetaTensor* qkv_out, + MetaTensor* qkv_bias_out, + MetaTensor* transpose_out_2, + MetaTensor* qk_out, + MetaTensor* qktv_out, + MetaTensor* softmax_out, + MetaTensor* attn_dropout_mask_out, + MetaTensor* attn_dropout_out, + MetaTensor* src_mask_out, + MetaTensor* fmha_out, + MetaTensor* out_linear_out, + MetaTensor* dropout_mask_out, + MetaTensor* ln_mean_2, + MetaTensor* ln_var_2, + MetaTensor* bias_dropout_residual_out, + MetaTensor* cache_kv_out, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + auto y_dim = qkv_weight.dims(); + + int dim_head = 0; + int hidden_size = 0; + int nranks = 1; + if (transpose_qkv_wb) { + PADDLE_ENFORCE_EQ(y_dim.size(), + 2, + phi::errors::InvalidArgument( + "The dimensions of qkv_weight must be 2 if enable" + "transpose_qkv_wb: (dim_embed, 3 * dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); + PADDLE_ENFORCE_GT(num_heads, + 0, + phi::errors::InvalidArgument( + "The num_heads must be provided and greater than 0 " + "if enable transpose_qkv_wb, but we got %d.", + num_heads)); + PADDLE_ENFORCE_EQ(y_dim[0] % num_heads, + 0, + phi::errors::InvalidArgument( + "First dim of qkv_w must be divisible by num heads " + "if enable transpose_qkv_wb, but receive first " + "dim of qkv_w is %d and num_heads is %d.", + y_dim[0], + num_heads)); + if (ring_id == -1) { + PADDLE_ENFORCE_EQ( + y_dim[0] * 3, + y_dim[1], + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 2" + "(dim_embed, 3 * dim_embed).")); + } else { + // compute the mp nranks + nranks = (y_dim[0] * 3) / y_dim[1]; + } + dim_head = y_dim[0] / (num_heads * nranks); + hidden_size = y_dim[0]; + } else { + PADDLE_ENFORCE_EQ(y_dim.size(), + 4, + phi::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4 if not" + "enable transpose_qkv_wb: (3, num_head, dim_head, " + "dim_embed), but received [%d]", + y_dim.size())); + PADDLE_ENFORCE_EQ( + y_dim[0], + 3, + phi::errors::InvalidArgument("First dim of qkv_w must be 3 if disable " + "transpose_qkv_wb, but we got %d.", + y_dim[0])); + if (ring_id == -1) { + PADDLE_ENFORCE_EQ( + y_dim[1] * y_dim[2], + y_dim[3], + phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } + num_heads = y_dim[1]; + dim_head = y_dim[2]; + hidden_size = y_dim[3]; + } + + PADDLE_ENFORCE_EQ( + x_dim.size(), + 3, + phi::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + + PADDLE_ENFORCE_EQ(x_dim[2], + hidden_size, + phi::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3] " + "(y_dim[1] if enable transpose_qkv_w) " + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, + y_dim)); + + if (pre_layer_norm) { + ln_mean->set_dims({x_dim[0] * x_dim[1]}); + ln_var->set_dims({x_dim[0] * x_dim[1]}); + ln_out->set_dims(x.dims()); + } else { + ln_mean_2->set_dims({x_dim[0] * x_dim[1]}); + ln_var_2->set_dims({x_dim[0] * x_dim[1]}); + bias_dropout_residual_out->set_dims(x.dims()); + } + + if (transpose_qkv_wb) { + // [batch_size, seq_len, 3 * num_heads * dim_head] + qkv_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head}); + + if (qkv_bias) { + qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head}); + } + } else { + // [batch_size, seq_len, 3, num_head, head_size] + qkv_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head}); + + if (qkv_bias) { + qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head}); + } + } + + // [3, batch_size, num_head, seq_len, head_size] + transpose_out_2->set_dims({3, x_dim[0], num_heads, x_dim[1], dim_head}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (cache_kv) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = cache_kv.dims(); + + PADDLE_ENFORCE_EQ( + c_dim.size(), + 5, + phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d", + c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], + 2, + phi::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], + x_dim[0], + phi::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], + c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], + num_heads, + phi::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + num_heads, + c_dim[2])); // num_head + // In compile stage, input seq_len can be -1, in that case + // c_dim[3] may < 0 in while + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + c_dim[3], + 0, + phi::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + } + + PADDLE_ENFORCE_EQ(c_dim[4], + dim_head, + phi::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + dim_head, + c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + cache_kv_out->set_dims( + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + // [batch, num_head, seq_len, out_seq_len] + qk_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + + if (src_mask) { + src_mask_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + } + // the same as QKOut's shape. 
+ attn_dropout_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + if (is_test) { + attn_dropout_mask_out->set_dims( + {x_dim[0], num_heads, x_dim[1], out_seq_len}); + } + softmax_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len}); + // [batch_size, num_heads, seq_len, head_dim] + qktv_out->set_dims({x_dim[0], num_heads, x_dim[1], dim_head}); + // [batch_size, seq_len, number of heads*head size] + fmha_out->set_dims({x_dim[0], x_dim[1], num_heads, dim_head}); + + out_linear_out->set_dims(x.dims()); + + if (is_test == false) { + dropout_mask_out->set_dims(x.dims()); + } + + out->set_dims(x.dims()); +} + void FusedLayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& residual, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index ee62d6d51d655..aaa4787968538 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -334,6 +334,55 @@ void FusedBiasActInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void FusedAttentionInferMeta(const MetaTensor& x, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const MetaTensor& qkv_weight, + const MetaTensor& qkv_bias, + const MetaTensor& cache_kv, + const MetaTensor& src_mask, + const MetaTensor& out_linear_weight, + const MetaTensor& out_linear_bias, + const MetaTensor& ln_scale_2, + const MetaTensor& ln_bias_2, + int num_heads, + bool transpose_qkv_wb, + bool pre_layer_norm, + float epsilon, + float attn_dropout_rate, + bool is_test, + bool attn_dropout_fix_seed, + int attn_dropout_seed, + const std::string& attn_dropout_implementation, + float dropout_rate, + bool dropout_fix_seed, + int dropout_seed, + const std::string& dropout_implementation, + float ln_epsilon, + bool add_residual, + int ring_id, + MetaTensor* ln_mean, + MetaTensor* ln_var, + MetaTensor* ln_out, + MetaTensor* qkv_out, + MetaTensor* qkv_bias_out, + MetaTensor* transpose_out_2, + MetaTensor* qk_out, + MetaTensor* qktv_out, + MetaTensor* softmax_out, + MetaTensor* attn_dropout_mask_out, + MetaTensor* attn_dropout_out, + MetaTensor* src_mask_out, + MetaTensor* fmha_out, + MetaTensor* out_linear_out, + MetaTensor* dropout_mask_out, + MetaTensor* ln_mean_2, + MetaTensor* ln_var_2, + MetaTensor* bias_dropout_residual_out, + MetaTensor* cache_kv_out, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void FusedLayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& residual, diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list index b85c88fa6bb18..3dc336a187718 100644 --- a/test/white_list/new_ir_op_test_white_list +++ b/test/white_list/new_ir_op_test_white_list @@ -88,6 +88,7 @@ test_fmax_op test_fmin_op test_fold_op test_frame_op +test_fused_attention_op_api test_gather_tree_op test_gaussian_random_op test_generate_proposals_v2_op From 85622b9d6c543c758f6405454ccd403e2ba9aa7d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 22 Sep 2023 02:50:35 +0000 Subject: [PATCH 39/39] fix merge conflict --- .../dialect/op_generator/op_creator_drr_gen.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py index 5f75063668ee0..01512a7d5b38d 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_creator_drr_gen.py @@ -79,9 +79,14 @@ def parse_yaml(self, 
op_yaml_files, op_compat_yaml_file): op_yaml_items = op_yaml_items + ops op_info_items = [] for op in op_yaml_items: - op_info_items.append( - OpInfoParser(op, op_compat_parser.get_compat(op['name'])) - ) + op_compat_item = op_compat_parser.get_compat(op['name']) + if ( + op_compat_item is not None + and op_compat_item['op'] == "pow" + and 'scalar' in op_compat_item + ): + op_compat_item = op_compat_item.pop('scalar') + op_info_items.append(OpInfoParser(op, op_compat_item)) return op_info_items def gen_cpp_file_code(self, cpp_file_path): @@ -117,9 +122,7 @@ def gen_cpp_file_code(self, cpp_file_path): if len(op_info_item.attribute_name_list) > len( op_info_item.mutable_attribute_name_list ): - # TODO(zyfncg): Currently Op::Build Interface doesn't support this case. - continue - # params_with_mutable_attr.append("attrs") + params_with_mutable_attr.append("attrs") body_code += MUTABLE_ATTR_FUNCTION_TEMPLATE.format( op_name=ir_op_name,
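
A note on the parse_yaml change in the hunk above: instead of passing the raw result of op_compat_parser.get_compat() straight to OpInfoParser, the generator now special-cases the 'pow' compat entry and replaces the whole compat item with its nested 'scalar' sub-mapping. The sketch below restates that guard as a standalone helper so the control flow is easy to test in isolation; the example dict is hypothetical (the real 'pow' entry lives in paddle/phi/api/yaml/op_compat.yaml and is not part of this patch), and normalize_compat_item is not a name that exists in the generator.

# Minimal sketch of the compat-item normalization added to parse_yaml.
# The example dict is hypothetical; the actual 'pow' entry in
# op_compat.yaml is not shown in this patch series.
def normalize_compat_item(op_compat_item):
    # For 'pow', keep only the nested 'scalar' sub-mapping; the generator
    # hands that sub-dict (not the full compat item) to OpInfoParser.
    if (
        op_compat_item is not None
        and op_compat_item.get('op') == "pow"
        and 'scalar' in op_compat_item
    ):
        return op_compat_item.pop('scalar')
    return op_compat_item


# Hypothetical usage:
pow_compat = {'op': 'pow', 'scalar': {'y': {'data_type': 'float'}}}
print(normalize_compat_item(pow_compat))  # {'y': {'data_type': 'float'}}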
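
Returning to the FusedAttentionInferMeta hunk added earlier in this series: every set_dims call there follows directly from the x and qkv_weight dims. Below is a minimal standalone sketch of that shape arithmetic for the transpose_qkv_wb == false, ring_id == -1 case. It is illustrative only — the helper name, the plain-tuple interface, and the example sizes are made up here, not phi APIs — and it covers the main attention outputs, not the layer-norm or dropout bookkeeping tensors.

# Illustrative shape arithmetic for the non-transposed qkv_weight layout
# (3, num_heads, dim_head, dim_embed) with num_heads * dim_head == dim_embed.
# This mirrors the set_dims calls in FusedAttentionInferMeta but is not the
# phi implementation; the function name and return format are assumptions.

def fused_attention_shapes(x_shape, qkv_w_shape, cache_kv_shape=None):
    batch, seq_len, dim_embed = x_shape
    three, num_heads, dim_head, hidden = qkv_w_shape
    assert three == 3 and num_heads * dim_head == hidden == dim_embed

    # Attention scores attend over cached keys plus the new sequence.
    cache_len = cache_kv_shape[3] if cache_kv_shape is not None else 0
    out_seq_len = seq_len + cache_len

    shapes = {
        "qkv_out": (batch, seq_len, 3, num_heads, dim_head),
        "transpose_out_2": (3, batch, num_heads, seq_len, dim_head),
        "qk_out": (batch, num_heads, seq_len, out_seq_len),
        "softmax_out": (batch, num_heads, seq_len, out_seq_len),
        "qktv_out": (batch, num_heads, seq_len, dim_head),
        "fmha_out": (batch, seq_len, num_heads, dim_head),
        "out": (batch, seq_len, dim_embed),
    }
    if cache_kv_shape is not None:
        # New keys/values are appended along the sequence axis of the cache.
        shapes["cache_kv_out"] = (2, batch, num_heads, out_seq_len, dim_head)
    return shapes


# Example: batch=2, seq_len=8, 4 heads of size 16 (dim_embed=64), no cache.
print(fused_attention_shapes((2, 8, 64), (3, 4, 16, 64)))

With a cache_kv of shape (2, 2, 4, 5, 16) the same call would give out_seq_len = 13 and cache_kv_out = (2, 2, 4, 13, 16), matching the cache_kv_out->set_dims call in the hunk.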