diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 4d0b04209c059..dc661fce388fe 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -201,10 +201,6 @@ if(WITH_DISTRIBUTE)
   add_definitions(-DPADDLE_WITH_DISTRIBUTE)
 endif()
 
-if(WITH_GFLAGS)
-  add_definitions(-DPADDLE_WITH_GFLAGS)
-endif()
-
 if(WITH_PSCORE)
   add_definitions(-DPADDLE_WITH_PSCORE)
 endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 75436783c7ede..3398f8a28307e 100755
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -110,6 +110,7 @@ endif()
 set(flags_dep)
 if(WITH_GFLAGS)
   list(APPEND flags_dep gflags)
+  add_definitions(-DPADDLE_WITH_GFLAGS)
 else()
   list(APPEND flags_dep paddle_flags)
 endif()
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
index 27ac1681a4008..3dc9175dbfd4b 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
@@ -414,7 +414,6 @@ void analyse_event_info_for_two_instructions<Instruction>(
 
   if (has_data_dependency<Instruction, std::string>(
           instructions[cur_instr_id], instructions[next_instr_id]) ||
-      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
       instructions[next_instr_id]->OpBase()->Type() == "depend") {
     waiter_instr_ids->insert(next_instr_id);
     return;
@@ -474,7 +473,6 @@ void analyse_event_info_for_two_instructions<
 
   if (has_data_dependency<paddle::framework::InstructionBase, ir::Value>(
           instructions[cur_instr_id], instructions[next_instr_id]) ||
-      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
       instructions[next_instr_id]->Name() == "pd.depend") {
     waiter_instr_ids->insert(next_instr_id);
     return;
diff --git a/paddle/fluid/framework/type_info.cc b/paddle/fluid/framework/type_info.cc
index 442800d035f55..cb7dae540d119 100644
--- a/paddle/fluid/framework/type_info.cc
+++ b/paddle/fluid/framework/type_info.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/raw_tensor.h"
 #include "paddle/fluid/framework/string_array.h"
+#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h"
 #include "paddle/fluid/prim/utils/static/desc_tensor.h"
 #include "paddle/fluid/primitive/type/lazy_tensor.h"
 
@@ -44,5 +45,6 @@ template class TypeInfoTraits<phi::TensorBase, paddle::prim::DescTensor>;
 template class TypeInfoTraits<phi::TensorBase, paddle::primitive::LazyTensor>;
 template class TypeInfoTraits<phi::TensorBase,
                               paddle::framework::VariableRefArray>;
+template class TypeInfoTraits<phi::TensorBase, paddle::dialect::IrMetaTensor>;
 
 }  // namespace phi
diff --git a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py b/paddle/fluid/ir/dialect/op_generator/op_build_gen.py
index d9747b47e8747..d36c269648315 100644
--- a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py
+++ b/paddle/fluid/ir/dialect/op_generator/op_build_gen.py
@@ -286,27 +286,25 @@ def GenBuildOutputs(
     build_output_str = '  VLOG(4) << "Builder construction outputs";\n'
     CREATE_INPUT_METATENSOR_TEMPLATE = """
   VLOG(4) << "Builder construction  dense_{name}";
-  phi::DenseTensor dense_{name}(std::make_unique<paddle::experimental::DefaultAllocator>(paddle::platform::CPUPlace()).get(),
-                                phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType({name}.dtype()),
-                                                     {name}.dims(),
-                                                     {name}.data_layout(),
-                                                     {name}.lod(),
-                                                     {name}.offset()));
+  paddle::dialect::IrMetaTensor ir_meta_tensor_{name}(paddle::dialect::TransToPhiDataType({name}.dtype()),
+                                                      {name}.dims(),
+                                                      {name}.data_layout(),
+                                                      {name}.lod(),
+                                                      {name}.offset());
   VLOG(4) << "Builder construction  meta_{name}";
-  phi::MetaTensor meta_{name}(&dense_{name});
+  phi::MetaTensor meta_{name}(&ir_meta_tensor_{name});
 """
-    CREATE_INPUT_VEC_METATENSOR_TEMPLATE = """  std::vector<phi::DenseTensor> vec_dense_{name};
+    CREATE_INPUT_VEC_METATENSOR_TEMPLATE = """  std::vector<paddle::dialect::IrMetaTensor> vec_ir_meta_tensor_{name};
   for (size_t i=0; i < static_cast<size_t>({name}.size()); i++) {{
-    vec_dense_{name}.push_back(phi::DenseTensor(std::make_unique<paddle::experimental::DefaultAllocator>(paddle::platform::CPUPlace()).get(),
-                                                phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType({name}[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+    vec_ir_meta_tensor_{name}.push_back(paddle::dialect::IrMetaTensor(paddle::dialect::TransToPhiDataType({name}[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
                                                                      {name}[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
                                                                      {name}[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
                                                                      {name}[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-                                                                     {name}[i].dyn_cast<paddle::dialect::DenseTensorType>().offset())));
+                                                                     {name}[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }}
   std::vector<phi::MetaTensor> vec_meta_{name};
-  for (size_t i=0; i < vec_dense_{name}.size(); i++) {{
-    vec_meta_{name}.push_back(phi::MetaTensor(&vec_dense_{name}[i]));
+  for (size_t i=0; i < vec_ir_meta_tensor_{name}.size(); i++) {{
+    vec_meta_{name}.push_back(phi::MetaTensor(&vec_ir_meta_tensor_{name}[i]));
   }}
 
   std::vector<const phi::MetaTensor*> meta_{name};
diff --git a/paddle/fluid/ir/dialect/op_generator/op_gen.py b/paddle/fluid/ir/dialect/op_generator/op_gen.py
index 5d51a731c546a..7ee65d050581b 100644
--- a/paddle/fluid/ir/dialect/op_generator/op_gen.py
+++ b/paddle/fluid/ir/dialect/op_generator/op_gen.py
@@ -101,6 +101,7 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{
 #include "{h_file}"
 #include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
 #include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h"
+#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h"
 #include "paddle/ir/core/builtin_attribute.h"
 #include "paddle/ir/core/builtin_type.h"
 #include "paddle/ir/core/builtin_op.h"
@@ -172,7 +173,7 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{
     'bool': 'ir::BoolAttribute',
 }
 
-_NO_NEED_GEN_OPS = {'add_n', 'split_grad'}
+_NO_NEED_GEN_OPS = {'add_n', 'add_n_', 'add_n_with_kernel', 'split_grad'}
 
 
 def to_phi_and_fluid_op_name(op_item):
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt b/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt
index cc8d1357bf070..08cc463c34c9b 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt
@@ -183,7 +183,7 @@ add_custom_target(ops_api_gen ALL DEPENDS ${ops_api_source_file})
 
 cc_library(
   pd_dialect_core
-  SRCS pd_attribute.cc pd_type.cc
+  SRCS pd_attribute.cc pd_type.cc pd_meta_tensor.cc
   DEPS phi pd_interface pd_trait type_info)
 cc_library(
   pd_dialect_op
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc
index dd68500a626a0..9c89059db6936 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc
@@ -49,6 +49,8 @@ void PaddleDialect::initialize() {
 #include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h"  // NOLINT
       >();
   RegisterOps<paddle::dialect::AddNOp,
+              paddle::dialect::AddN_Op,
+              paddle::dialect::AddNWithKernelOp,
               paddle::dialect::FusedGemmEpilogueOp,
               paddle::dialect::FusedGemmEpilogueGradOp,
               paddle::dialect::SplitGradOp>();
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc
index 2f2ba34c881e4..3d16c44405ab0 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc
@@ -58,13 +58,18 @@ void AddNOp::Verify() {
             "The size %d of inputs must be equal to 1.", input_size));
     if (auto vec_type = (*this)->operand(0).type().dyn_cast<ir::VectorType>()) {
       for (size_t i = 0; i < vec_type.size(); ++i) {
-        PADDLE_ENFORCE(vec_type[i].isa<paddle::dialect::DenseTensorType>(),
+        PADDLE_ENFORCE(vec_type[i].isa<paddle::dialect::DenseTensorType>() ||
+                           vec_type[i].isa<paddle::dialect::SelectedRowsType>(),
                        phi::errors::PreconditionNotMet(
                            "Type validation failed for the 0th input."));
       }
     } else {
       PADDLE_ENFORCE(
-          (*this)->operand(0).type().isa<paddle::dialect::DenseTensorType>(),
+          (*this)->operand(0).type().isa<paddle::dialect::DenseTensorType>() ||
+              (*this)
+                  ->operand(0)
+                  .type()
+                  .isa<paddle::dialect::SelectedRowsType>(),
           phi::errors::PreconditionNotMet(
               "Type validation failed for the 0th input."));
     }
@@ -82,7 +87,8 @@ void AddNOp::Verify() {
         phi::errors::PreconditionNotMet(
             "The size %d of outputs must be equal to 1.", output_size));
     PADDLE_ENFORCE(
-        (*this)->result(0).type().isa<paddle::dialect::DenseTensorType>(),
+        (*this)->result(0).type().isa<paddle::dialect::DenseTensorType>() ||
+            (*this)->result(0).type().isa<paddle::dialect::SelectedRowsType>(),
         phi::errors::PreconditionNotMet(
             "Type validation failed for the 0th output."));
   }
@@ -147,6 +153,262 @@ void AddNOp::InferMeta(phi::InferMetaContext *infer_meta) {
   fn(infer_meta);
 }
 
+// Assembles the op-info tuple for pd.add_n_: a single "inputs" operand typed
+// as a vector of dense tensors, no attributes, a single dense-tensor "out"
+// result, run-time info built from "AddNInferMeta" / "add_n", and the op
+// name "add_n_".
+OpInfoTuple AddN_Op::GetOpInfo() {
+  const char *input_type = "ir::VectorType<paddle::dialect::DenseTensorType>";
+  std::vector<paddle::dialect::OpInputInfo> input_infos = {
+      paddle::dialect::OpInputInfo("inputs", input_type, false, false, false)};
+  std::vector<paddle::dialect::OpAttributeInfo> attribute_infos = {};
+  std::vector<paddle::dialect::OpOutputInfo> output_infos = {
+      paddle::dialect::OpOutputInfo(
+          "out", "paddle::dialect::DenseTensorType", false, false)};
+  paddle::dialect::OpRunTimeInfo runtime_info(
+      "AddNInferMeta", {"inputs"}, {"add_n"}, {"inputs"}, {}, {}, {}, {});
+  return std::make_tuple(
+      input_infos, attribute_infos, output_infos, runtime_info, "add_n_");
+}
+
+// Builds a pd.add_n_ operation.
+//
+// `inputs_` becomes the only operand. The single output type is inferred by
+// running phi::AddNInferMeta over tensors that mirror the element types of
+// the input vector; only the resulting meta information (dtype, dims, layout,
+// lod, offset) is read back to create the output DenseTensorType.
+//
+// `builder` is not used by this overload.
+//
+// Raises PreconditionNotMet when `inputs_` is not an ir::VectorType or when
+// one of its elements is not a DenseTensorType, rather than calling accessors
+// on the result of a failed dyn_cast. Verify() also accepts SelectedRowsType
+// elements, so such inputs can otherwise reach this point.
+void AddN_Op::Build(ir::Builder &builder,
+                    ir::OperationArgument &argument,
+                    ir::OpResult inputs_) {
+  VLOG(4) << "Builder construction inputs";
+  std::vector<ir::OpResult> argument_inputs = {inputs_};
+  argument.AddOperands(argument_inputs.begin(), argument_inputs.end());
+
+  VLOG(4) << "Builder construction attributes";
+
+  VLOG(4) << "Builder construction outputs";
+  PADDLE_ENFORCE(inputs_.type().isa<ir::VectorType>(),
+                 phi::errors::PreconditionNotMet(
+                     "The input of add_n_ must be an ir::VectorType."));
+  ir::VectorType inputs = inputs_.type().dyn_cast<ir::VectorType>();
+  const size_t input_num = static_cast<size_t>(inputs.size());
+
+  std::vector<phi::DenseTensor> vec_dense_inputs;
+  vec_dense_inputs.reserve(input_num);
+  for (size_t i = 0; i < input_num; ++i) {
+    PADDLE_ENFORCE(inputs[i].isa<paddle::dialect::DenseTensorType>(),
+                   phi::errors::PreconditionNotMet(
+                       "The %d-th input of add_n_ must be a DenseTensorType.",
+                       i));
+    // Cast once per element instead of once per accessed field.
+    auto dense_type = inputs[i].dyn_cast<paddle::dialect::DenseTensorType>();
+    vec_dense_inputs.push_back(phi::DenseTensor(
+        std::make_unique<paddle::experimental::DefaultAllocator>(
+            paddle::platform::CPUPlace())
+            .get(),
+        phi::DenseTensorMeta(
+            paddle::dialect::TransToPhiDataType(dense_type.dtype()),
+            dense_type.dims(),
+            dense_type.data_layout(),
+            dense_type.lod(),
+            dense_type.offset())));
+  }
+
+  // vec_dense_inputs is complete at this point, so the element addresses
+  // handed to the MetaTensor wrappers are not invalidated by further growth.
+  std::vector<phi::MetaTensor> vec_meta_inputs;
+  vec_meta_inputs.reserve(input_num);
+  for (auto &dense_input : vec_dense_inputs) {
+    vec_meta_inputs.push_back(phi::MetaTensor(&dense_input));
+  }
+
+  // Same reasoning: take addresses only after vec_meta_inputs is complete.
+  std::vector<const phi::MetaTensor *> meta_inputs;
+  meta_inputs.reserve(input_num);
+  for (const auto &meta_input : vec_meta_inputs) {
+    meta_inputs.push_back(&meta_input);
+  }
+
+  phi::DenseTensor dense_out;
+  phi::MetaTensor meta_out(&dense_out);
+
+  phi::AddNInferMeta(meta_inputs, &meta_out);
+
+  std::vector<ir::Type> argument_outputs;
+  ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get(
+      ir::IrContext::Instance(),
+      paddle::dialect::TransToIrDataType(dense_out.dtype()),
+      dense_out.dims(),
+      dense_out.layout(),
+      dense_out.lod(),
+      dense_out.offset());
+  argument_outputs.push_back(out_dense_tensor_type);
+  argument.AddOutputs(argument_outputs.begin(), argument_outputs.end());
+}
+
+// Verifies a pd.add_n_ operation:
+//   * exactly one operand; if it is an ir::VectorType every element must be
+//     a DenseTensorType or SelectedRowsType, otherwise the operand type
+//     itself must be one of the two;
+//   * no attributes to check;
+//   * exactly one result, typed DenseTensorType or SelectedRowsType.
+// Any violation is reported through PADDLE_ENFORCE / PADDLE_ENFORCE_EQ.
+void AddN_Op::Verify() {
+  VLOG(4) << "Start Verifying inputs, outputs and attributes for: AddN_Op.";
+
+  VLOG(4) << "Verifying inputs:";
+  {
+    auto input_size = num_operands();
+    PADDLE_ENFORCE_EQ(
+        input_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of inputs must be equal to 1.", input_size));
+
+    auto operand_type = (*this)->operand_source(0).type();
+    auto vec_type = operand_type.dyn_cast<ir::VectorType>();
+    if (!vec_type) {
+      // Non-vector operand: check the operand type directly.
+      PADDLE_ENFORCE(
+          operand_type.isa<paddle::dialect::DenseTensorType>() ||
+              operand_type.isa<paddle::dialect::SelectedRowsType>(),
+          phi::errors::PreconditionNotMet(
+              "Type validation failed for the 0th input."));
+    } else {
+      // Vector operand: check every element type.
+      for (size_t idx = 0; idx < vec_type.size(); ++idx) {
+        auto element_type = vec_type[idx];
+        PADDLE_ENFORCE(
+            element_type.isa<paddle::dialect::DenseTensorType>() ||
+                element_type.isa<paddle::dialect::SelectedRowsType>(),
+            phi::errors::PreconditionNotMet(
+                "Type validation failed for the 0th input."));
+      }
+    }
+  }
+
+  VLOG(4) << "Verifying attributes:";
+  {
+    // The op has zero attributes, so there is nothing to type-check here.
+  }
+
+  VLOG(4) << "Verifying outputs:";
+  {
+    auto output_size = num_results();
+    PADDLE_ENFORCE_EQ(
+        output_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of outputs must be equal to 1.", output_size));
+
+    auto result_type = (*this)->result(0).type();
+    PADDLE_ENFORCE(
+        result_type.isa<paddle::dialect::DenseTensorType>() ||
+            result_type.isa<paddle::dialect::SelectedRowsType>(),
+        phi::errors::PreconditionNotMet(
+            "Type validation failed for the 0th output."));
+  }
+
+  VLOG(4) << "End Verifying for: AddN_Op.";
+}
+
+// Forwards the infer-meta context to phi::AddNInferMeta through the
+// PD_INFER_META adaptor.
+void AddN_Op::InferMeta(phi::InferMetaContext *infer_meta) {
+  auto infer_meta_fn = PD_INFER_META(phi::AddNInferMeta);
+  infer_meta_fn(infer_meta);
+}
+
+// Assembles the op-info tuple for pd.add_n_with_kernel: a single "inputs"
+// operand typed as a vector of dense tensors, no attributes, a single
+// dense-tensor "out" result, run-time info built from "AddNInferMeta" /
+// "add_n", and the op name "add_n_with_kernel".
+OpInfoTuple AddNWithKernelOp::GetOpInfo() {
+  const char *input_type = "ir::VectorType<paddle::dialect::DenseTensorType>";
+  std::vector<paddle::dialect::OpInputInfo> input_infos = {
+      paddle::dialect::OpInputInfo("inputs", input_type, false, false, false)};
+  std::vector<paddle::dialect::OpAttributeInfo> attribute_infos = {};
+  std::vector<paddle::dialect::OpOutputInfo> output_infos = {
+      paddle::dialect::OpOutputInfo(
+          "out", "paddle::dialect::DenseTensorType", false, false)};
+  paddle::dialect::OpRunTimeInfo runtime_info(
+      "AddNInferMeta", {"inputs"}, {"add_n"}, {"inputs"}, {}, {}, {}, {});
+  return std::make_tuple(input_infos,
+                         attribute_infos,
+                         output_infos,
+                         runtime_info,
+                         "add_n_with_kernel");
+}
+
+// Builds a pd.add_n_with_kernel operation.
+//
+// `inputs_` becomes the only operand. The single output type is inferred by
+// running phi::AddNInferMeta over tensors that mirror the element types of
+// the input vector; only the resulting meta information (dtype, dims, layout,
+// lod, offset) is read back to create the output DenseTensorType.
+//
+// `builder` is not used by this overload.
+//
+// Raises PreconditionNotMet when `inputs_` is not an ir::VectorType or when
+// one of its elements is not a DenseTensorType, rather than calling accessors
+// on the result of a failed dyn_cast. Verify() also accepts SelectedRowsType
+// elements, so such inputs can otherwise reach this point.
+void AddNWithKernelOp::Build(ir::Builder &builder,
+                             ir::OperationArgument &argument,
+                             ir::OpResult inputs_) {
+  VLOG(4) << "Builder construction inputs";
+  std::vector<ir::OpResult> argument_inputs = {inputs_};
+  argument.AddOperands(argument_inputs.begin(), argument_inputs.end());
+
+  VLOG(4) << "Builder construction attributes";
+
+  VLOG(4) << "Builder construction outputs";
+  PADDLE_ENFORCE(
+      inputs_.type().isa<ir::VectorType>(),
+      phi::errors::PreconditionNotMet(
+          "The input of add_n_with_kernel must be an ir::VectorType."));
+  ir::VectorType inputs = inputs_.type().dyn_cast<ir::VectorType>();
+  const size_t input_num = static_cast<size_t>(inputs.size());
+
+  std::vector<phi::DenseTensor> vec_dense_inputs;
+  vec_dense_inputs.reserve(input_num);
+  for (size_t i = 0; i < input_num; ++i) {
+    PADDLE_ENFORCE(
+        inputs[i].isa<paddle::dialect::DenseTensorType>(),
+        phi::errors::PreconditionNotMet(
+            "The %d-th input of add_n_with_kernel must be a DenseTensorType.",
+            i));
+    // Cast once per element instead of once per accessed field.
+    auto dense_type = inputs[i].dyn_cast<paddle::dialect::DenseTensorType>();
+    vec_dense_inputs.push_back(phi::DenseTensor(
+        std::make_unique<paddle::experimental::DefaultAllocator>(
+            paddle::platform::CPUPlace())
+            .get(),
+        phi::DenseTensorMeta(
+            paddle::dialect::TransToPhiDataType(dense_type.dtype()),
+            dense_type.dims(),
+            dense_type.data_layout(),
+            dense_type.lod(),
+            dense_type.offset())));
+  }
+
+  // vec_dense_inputs is complete at this point, so the element addresses
+  // handed to the MetaTensor wrappers are not invalidated by further growth.
+  std::vector<phi::MetaTensor> vec_meta_inputs;
+  vec_meta_inputs.reserve(input_num);
+  for (auto &dense_input : vec_dense_inputs) {
+    vec_meta_inputs.push_back(phi::MetaTensor(&dense_input));
+  }
+
+  // Same reasoning: take addresses only after vec_meta_inputs is complete.
+  std::vector<const phi::MetaTensor *> meta_inputs;
+  meta_inputs.reserve(input_num);
+  for (const auto &meta_input : vec_meta_inputs) {
+    meta_inputs.push_back(&meta_input);
+  }
+
+  phi::DenseTensor dense_out;
+  phi::MetaTensor meta_out(&dense_out);
+
+  phi::AddNInferMeta(meta_inputs, &meta_out);
+
+  std::vector<ir::Type> argument_outputs;
+  ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get(
+      ir::IrContext::Instance(),
+      paddle::dialect::TransToIrDataType(dense_out.dtype()),
+      dense_out.dims(),
+      dense_out.layout(),
+      dense_out.lod(),
+      dense_out.offset());
+  argument_outputs.push_back(out_dense_tensor_type);
+  argument.AddOutputs(argument_outputs.begin(), argument_outputs.end());
+}
+
+// Verifies a pd.add_n_with_kernel operation:
+//   * exactly one operand; if it is an ir::VectorType every element must be
+//     a DenseTensorType or SelectedRowsType, otherwise the operand type
+//     itself must be one of the two;
+//   * no attributes to check;
+//   * exactly one result, typed DenseTensorType or SelectedRowsType.
+// Any violation is reported through PADDLE_ENFORCE / PADDLE_ENFORCE_EQ.
+void AddNWithKernelOp::Verify() {
+  VLOG(4) << "Start Verifying inputs, outputs and attributes for: "
+             "AddNWithKernelOp.";
+
+  VLOG(4) << "Verifying inputs:";
+  {
+    auto input_size = num_operands();
+    PADDLE_ENFORCE_EQ(
+        input_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of inputs must be equal to 1.", input_size));
+
+    auto operand_type = (*this)->operand_source(0).type();
+    auto vec_type = operand_type.dyn_cast<ir::VectorType>();
+    if (!vec_type) {
+      // Non-vector operand: check the operand type directly.
+      PADDLE_ENFORCE(
+          operand_type.isa<paddle::dialect::DenseTensorType>() ||
+              operand_type.isa<paddle::dialect::SelectedRowsType>(),
+          phi::errors::PreconditionNotMet(
+              "Type validation failed for the 0th input."));
+    } else {
+      // Vector operand: check every element type.
+      for (size_t idx = 0; idx < vec_type.size(); ++idx) {
+        auto element_type = vec_type[idx];
+        PADDLE_ENFORCE(
+            element_type.isa<paddle::dialect::DenseTensorType>() ||
+                element_type.isa<paddle::dialect::SelectedRowsType>(),
+            phi::errors::PreconditionNotMet(
+                "Type validation failed for the 0th input."));
+      }
+    }
+  }
+
+  VLOG(4) << "Verifying attributes:";
+  {
+    // The op has zero attributes, so there is nothing to type-check here.
+  }
+
+  VLOG(4) << "Verifying outputs:";
+  {
+    auto output_size = num_results();
+    PADDLE_ENFORCE_EQ(
+        output_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of outputs must be equal to 1.", output_size));
+
+    auto result_type = (*this)->result(0).type();
+    PADDLE_ENFORCE(
+        result_type.isa<paddle::dialect::DenseTensorType>() ||
+            result_type.isa<paddle::dialect::SelectedRowsType>(),
+        phi::errors::PreconditionNotMet(
+            "Type validation failed for the 0th output."));
+  }
+
+  VLOG(4) << "End Verifying for: AddNWithKernelOp.";
+}
+
+// Forwards the infer-meta context to phi::AddNInferMeta through the
+// PD_INFER_META adaptor.
+void AddNWithKernelOp::InferMeta(phi::InferMetaContext *infer_meta) {
+  auto infer_meta_fn = PD_INFER_META(phi::AddNInferMeta);
+  infer_meta_fn(infer_meta);
+}
+
 const char *FusedGemmEpilogueOp::attributes_name[3] = {
     "trans_x", "trans_y", "activation"};
 
@@ -794,3 +1056,5 @@ IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op)
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp)
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h
index 4db22c3908254..ca163029e7d0d 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h
@@ -24,6 +24,7 @@ paddle::dialect::AddNOp, paddle::dialect::SplitGradOp
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h"
 #include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h"
+#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h"
 #include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h"
 #include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h"
 #include "paddle/ir/core/builder.h"
@@ -51,6 +52,47 @@ class AddNOp : public ir::Op<AddNOp, OpYamlInfoInterface, InferMetaInterface> {
   static void InferMeta(phi::InferMetaContext *infer_meta);
 };
 
+// Manually defined add_n variant registered as "pd.add_n_". It carries
+// InplaceTrait in addition to the yaml-info and infer-meta interfaces, has a
+// single operand (`inputs`) and a single result (`out`), and no attributes.
+class AddN_Op : public ir::Op<AddN_Op,
+                              OpYamlInfoInterface,
+                              InferMetaInterface,
+                              InplaceTrait> {
+ public:
+  using Op::Op;
+
+  // Identification and (empty) attribute table.
+  static const char *name() { return "pd.add_n_"; }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+
+  // Static hooks; declared here, defined out of line.
+  static OpInfoTuple GetOpInfo();
+  static void InferMeta(phi::InferMetaContext *infer_meta);
+  static void Build(ir::Builder &builder,             // NOLINT
+                    ir::OperationArgument &argument,  // NOLINT
+                    ir::OpResult inputs_);
+
+  void Verify();
+
+  // Operand 0 and result 0 of the operation.
+  ir::Value inputs() { return operand_source(0); }
+  ir::OpResult out() { return result(0); }
+};
+
+// Manually defined add_n variant registered as "pd.add_n_with_kernel". It
+// implements the yaml-info and infer-meta interfaces, has a single operand
+// (`inputs`) and a single result (`out`), and no attributes.
+class AddNWithKernelOp : public ir::Op<AddNWithKernelOp,
+                                       OpYamlInfoInterface,
+                                       InferMetaInterface> {
+ public:
+  using Op::Op;
+
+  // Identification and (empty) attribute table.
+  static const char *name() { return "pd.add_n_with_kernel"; }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;
+
+  // Static hooks; declared here, defined out of line.
+  static OpInfoTuple GetOpInfo();
+  static void InferMeta(phi::InferMetaContext *infer_meta);
+  static void Build(ir::Builder &builder,             // NOLINT
+                    ir::OperationArgument &argument,  // NOLINT
+                    ir::OpResult inputs_);
+
+  void Verify();
+
+  // Operand 0 and result 0 of the operation.
+  ir::Value inputs() { return operand_source(0); }
+  ir::OpResult out() { return result(0); }
+};
+
 class FusedGemmEpilogueOp : public ir::Op<FusedGemmEpilogueOp,
                                           paddle::dialect::OpYamlInfoInterface,
                                           paddle::dialect::InferMetaInterface> {
@@ -137,5 +179,7 @@ IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp)
+IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op)
+IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp)
 
 #endif
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc
new file mode 100644
index 0000000000000..2da7b098a6556
--- /dev/null
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h"
+
+#include "paddle/ir/core/enforce.h"
+
+namespace paddle {
+namespace dialect {
+
+// Stores the given meta fields as-is.
+IrMetaTensor::IrMetaTensor(phi::DataType dtype,
+                           const phi::DDim& dims,
+                           phi::DataLayout layout,
+                           const LoD& lod,
+                           size_t offset)
+    : dims_(dims), dtype_(dtype), layout_(layout), lod_(lod), offset_(offset) {}
+
+// Copies every meta field of `other`. Members are copy-constructed in the
+// initializer list rather than default-constructed and then assigned.
+IrMetaTensor::IrMetaTensor(const IrMetaTensor& other)
+    : dims_(other.dims_),
+      dtype_(other.dtype_),
+      layout_(other.layout_),
+      lod_(other.lod_),
+      offset_(other.offset_) {}
+
+// Copy-assigns every meta field of `other`.
+IrMetaTensor& IrMetaTensor::operator=(const IrMetaTensor& other) {
+  dims_ = other.dims_;
+  dtype_ = other.dtype_;
+  layout_ = other.layout_;
+  lod_ = other.lod_;
+  offset_ = other.offset_;
+  return *this;
+}
+
+// Move-assigns the meta fields of `other`.
+//
+// The members are moved directly: dims() and lod() return const references,
+// so std::move on them selects the copy assignment, and copying the lod can
+// throw inside this noexcept function.
+IrMetaTensor& IrMetaTensor::operator=(IrMetaTensor&& other) noexcept {
+  // Guard against `x = std::move(x)`, which must not clobber the fields.
+  if (this != &other) {
+    dims_ = std::move(other.dims_);
+    dtype_ = other.dtype_;
+    layout_ = other.layout_;
+    lod_ = std::move(other.lod_);
+    offset_ = other.offset_;
+  }
+  return *this;
+}
+
+// Number of elements described by dims_.
+int64_t IrMetaTensor::numel() const { return phi::product(dims_); }
+
+// Not supported for IrMetaTensor; reports an error through IR_THROW.
+const phi::Place& IrMetaTensor::place() const {
+  IR_THROW("Don't use IrMetaTensor::place method.");
+}
+
+// Not supported for IrMetaTensor; all arguments are ignored and an error is
+// reported through IR_THROW.
+void* IrMetaTensor::AllocateFrom(phi::Allocator* allocator,
+                                 phi::DataType dtype,
+                                 size_t requested_size,
+                                 bool fake_alloc) {
+  IR_THROW("Don't use IrMetaTensor::AllocateFrom method.");
+}
+
+}  // namespace dialect
+}  // namespace paddle
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h
new file mode 100644
index 0000000000000..ffcbd415c368a
--- /dev/null
+++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/tensor_base.h"
+#include "paddle/phi/core/tensor_meta.h"
+
+namespace paddle {
+namespace dialect {
+
+using LoD = std::vector<std::vector<size_t>>;
+
+// Tensor stand-in that stores only meta information: dims, dtype, layout,
+// lod and offset. place() and AllocateFrom() are declared to satisfy the
+// phi::TensorBase interface; their out-of-line definitions only raise an
+// error.
+class IrMetaTensor : public phi::TensorBase,
+                     public phi::TypeInfoTraits<phi::TensorBase, IrMetaTensor> {
+ public:
+  // Type name exposed by this tensor class.
+  static const char* name() { return "IrMetaTensor"; }
+
+  // Construction, copy and move.
+  IrMetaTensor(phi::DataType dtype,
+               const phi::DDim& dims,
+               phi::DataLayout layout,
+               const LoD& lod,
+               size_t offset = 0);
+  IrMetaTensor(const IrMetaTensor& other);
+  IrMetaTensor(IrMetaTensor&& other) = default;
+  IrMetaTensor& operator=(const IrMetaTensor& other);
+  IrMetaTensor& operator=(IrMetaTensor&& other) noexcept;
+  virtual ~IrMetaTensor() = default;
+
+  // Meta accessors.
+  const phi::DDim& dims() const noexcept override { return dims_; }
+  phi::DataType dtype() const noexcept override { return dtype_; }
+  phi::DataLayout layout() const noexcept override { return layout_; }
+  const LoD& lod() const noexcept { return lod_; }
+  size_t offset() const noexcept { return offset_; }
+  int64_t numel() const override;
+
+  // Both always report true.
+  bool valid() const noexcept override { return true; }
+  bool initialized() const override { return true; }
+
+  // Unsupported parts of the TensorBase interface (see class comment).
+  const phi::Place& place() const override;
+  void* AllocateFrom(phi::Allocator* allocator,
+                     phi::DataType dtype,
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
+
+ private:
+  phi::DDim dims_;
+  phi::DataType dtype_{phi::DataType::UNDEFINED};
+  phi::DataLayout layout_{phi::DataLayout::NCHW};
+  LoD lod_;
+  size_t offset_{0};
+};
+
+}  // namespace dialect
+}  // namespace paddle
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
index 1958a9444bcb9..b1916d5418f77 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
@@ -118,8 +118,18 @@ void BuildPhiContext(ir::Operation* op,
       InListType inputs;
       auto& variable_array = var->Get<paddle::framework::VariableRefArray>();
       for (size_t i = 0; i < variable_array.size(); ++i) {
-        inputs.emplace_back(InType(const_cast<phi::DenseTensor*>(
-            &(variable_array[i]->Get<phi::DenseTensor>()))));
+        if (variable_array[i]->IsType<phi::DenseTensor>()) {
+          inputs.emplace_back(InType(const_cast<phi::DenseTensor*>(
+              &(variable_array[i]->Get<phi::DenseTensor>()))));
+        } else if (variable_array[i]->IsType<phi::SelectedRows>()) {
+          inputs.emplace_back(InType(const_cast<phi::SelectedRows*>(
+              &(variable_array[i]->Get<phi::SelectedRows>()))));
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented(
+              "Only support Vector<DenseTensor> and vector<SelectedRows> now, "
+              "not support vector<%d>.",
+              variable_array[i]->Type()));
+        }
       }
       ctx->EmplaceBackInputs(inputs);
     } else {
@@ -315,8 +325,18 @@ void BuildPhiContext(ir::Operation* op,
       auto& variable_array = inner_scope->FindVar(name_map.at(out_ptr))
                                  ->Get<paddle::framework::VariableRefArray>();
       for (size_t i = 0; i < variable_array.size(); ++i) {
-        outputs.emplace_back(OutType(const_cast<phi::DenseTensor*>(
-            &(variable_array[i]->Get<phi::DenseTensor>()))));
+        if (variable_array[i]->IsType<phi::DenseTensor>()) {
+          outputs.emplace_back(OutType(const_cast<phi::DenseTensor*>(
+              &(variable_array[i]->Get<phi::DenseTensor>()))));
+        } else if (variable_array[i]->IsType<phi::SelectedRows>()) {
+          outputs.emplace_back(OutType(const_cast<phi::SelectedRows*>(
+              &(variable_array[i]->Get<phi::SelectedRows>()))));
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented(
+              "Only support Vector<DenseTensor> and vector<SelectedRows> now, "
+              "not support vector<%d>.",
+              variable_array[i]->Type()));
+        }
       }
       ctx->EmplaceBackOutputs(outputs);
     } else {
diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
index 84f18baa55aea..d75c7cc4779ff 100644
--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
@@ -149,34 +149,67 @@ bool SkipFeedOp(ir::Operation* op, const std::set<std::string>& feed_names) {
       op->attributes().at("name").dyn_cast<ir::StrAttribute>().AsString());
 }
 
-std::vector<phi::DenseTensor> GetFakeTensorList(ir::Value new_input_tmp) {
-  std::vector<phi::DenseTensor> vec_res;
+// Builds placeholder tensors (null, zero-sized allocation) that carry the
+// dtype/dims/layout/lod/offset/place of `new_input_tmp`'s allocated type.
+// A VectorType input yields one placeholder per supported element.
+std::vector<std::shared_ptr<phi::TensorBase>> GetFakeTensorList(
+    ir::Value new_input_tmp) {
+  std::vector<std::shared_ptr<phi::TensorBase>> vec_res;
   auto input_type = new_input_tmp.type();
-  std::vector<dialect::AllocatedDenseTensorType> types;
-  if (input_type.isa<dialect::AllocatedDenseTensorType>()) {
-    types.push_back(input_type.dyn_cast<dialect::AllocatedDenseTensorType>());
-  } else if (input_type.isa<ir::VectorType>()) {
-    auto vec_inner_types = input_type.dyn_cast<ir::VectorType>().data();
-    for (size_t i = 0; i < vec_inner_types.size(); ++i) {
-      types.push_back(
-          vec_inner_types[0].dyn_cast<dialect::AllocatedDenseTensorType>());
-    }
-  }
 
-  for (auto& type : types) {
-    auto ptr = new phi::Allocation(nullptr, 0, type.place());
+  auto build_fake_dense_tensor =
+      [](const dialect::AllocatedDenseTensorType& type) {
+        auto holder =
+            std::make_shared<phi::Allocation>(nullptr, 0, type.place());
+        auto dtype = TransToPhiDataType(type.dtype());
 
-    std::shared_ptr<phi::Allocation> holder(ptr);
+        phi::DenseTensorMeta meta(
+            dtype, type.dims(), type.data_layout(), type.lod(), type.offset());
 
-    auto dtype = TransToPhiDataType(type.dtype());
+        return std::make_shared<phi::DenseTensor>(holder, meta);
+      };
 
-    phi::DenseTensorMeta meta(
-        dtype, type.dims(), type.data_layout(), type.lod(), type.offset());
+  auto build_fake_selected_rows =
+      [](const dialect::AllocatedSelectedRowsType& type) {
+        auto dtype = TransToPhiDataType(type.dtype());
 
-    phi::DenseTensor fake_tensor(holder, meta);
+        auto holder =
+            std::make_shared<phi::Allocation>(nullptr, 0, type.place());
 
-    vec_res.push_back(fake_tensor);
+        phi::DenseTensorMeta meta(
+            dtype, type.dims(), type.data_layout(), type.lod(), type.offset());
+
+        // Empty SelectedRows (no rows, zero height) wrapping the placeholder.
+        std::vector<int64_t> rows;
+        int64_t height = 0;
+        auto sr = std::make_shared<phi::SelectedRows>(rows, height);
+
+        phi::DenseTensor dense_tensor(holder, meta);
+        *(sr->mutable_value()) = dense_tensor;
+
+        return sr;
+      };
+
+  if (input_type.isa<dialect::AllocatedDenseTensorType>()) {
+    vec_res.push_back(build_fake_dense_tensor(
+        input_type.dyn_cast<dialect::AllocatedDenseTensorType>()));
+  } else if (input_type.isa<dialect::AllocatedSelectedRowsType>()) {
+    vec_res.push_back(build_fake_selected_rows(
+        input_type.dyn_cast<dialect::AllocatedSelectedRowsType>()));
+  } else if (input_type.isa<ir::VectorType>()) {
+    auto vec_inner_types = input_type.dyn_cast<ir::VectorType>().data();
+    for (size_t i = 0; i < vec_inner_types.size(); ++i) {
+      // Each element may have its own type, so dispatch on element i.
+      if (vec_inner_types[i].isa<dialect::AllocatedDenseTensorType>()) {
+        vec_res.push_back(build_fake_dense_tensor(
+            vec_inner_types[i].dyn_cast<dialect::AllocatedDenseTensorType>()));
+      } else if (vec_inner_types[i].isa<dialect::AllocatedSelectedRowsType>()) {
+        vec_res.push_back(build_fake_selected_rows(
+            vec_inner_types[i].dyn_cast<dialect::AllocatedSelectedRowsType>()));
+      }
+    }
   }
+
   return vec_res;
 }
 
@@ -514,7 +547,7 @@ phi::KernelKey GetKernelKey(
 
       auto fake_tensors = GetFakeTensorList(new_input_tmp);
       for (auto& fake_tensor : fake_tensors) {
-        kernel_key_parser.AssignKernelKeySet(fake_tensor);
+        kernel_key_parser.AssignKernelKeySet(*fake_tensor);
       }
 
       // Because we can't make sure the place when build data op
@@ -617,6 +650,12 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
                 new_in.type()
                     .dyn_cast<paddle::dialect::AllocatedDenseTensorType>()
                     .place());
+          } else if (new_in.type()
+                         .isa<paddle::dialect::AllocatedSelectedRowsType>()) {
+            out_places.push_back(
+                new_in.type()
+                    .dyn_cast<paddle::dialect::AllocatedSelectedRowsType>()
+                    .place());
           } else {
             PADDLE_THROW(phi::errors::Unimplemented(
                 "only support dense tensor type for now"));
@@ -759,6 +798,14 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
     if (op_info_parser != nullptr) {
       kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
     }
+
+    if (op_item->name() == "pd.add_n_" ||
+        op_item->name() == "pd.add_n_with_kernel") {
+      if (op_item->result(0).type().isa<dialect::SelectedRowsType>()) {
+        kernel_fn_str = "add_n_sr";
+      }
+    }
+
     auto kernel_key =
         GetKernelKey(op_item, place, map_value_pair, op_info_parser.get());
     VLOG(6) << "kernel type " << kernel_key;
@@ -929,9 +976,22 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
               for (size_t j = 0; j < pre_define_op->num_operands(); ++j) {
                 auto in_i = map_value_pair.at(pre_define_op->operand_source(j));
                 auto in_i_type = in_i.type();
-                auto place =
-                    in_i_type.dyn_cast<dialect::AllocatedDenseTensorType>()
-                        .place();
+                phi::Place place;
+                if (in_i_type.isa<dialect::AllocatedDenseTensorType>()) {
+                  place =
+                      in_i_type.dyn_cast<dialect::AllocatedDenseTensorType>()
+                          .place();
+                } else if (in_i_type
+                               .isa<dialect::AllocatedSelectedRowsType>()) {
+                  place =
+                      in_i_type.dyn_cast<dialect::AllocatedSelectedRowsType>()
+                          .place();
+                } else {
+                  PADDLE_THROW(phi::errors::Unimplemented(
+                      "builtin.combine Input type only support "
+                      "VectorType<DenseTensorType> and "
+                      "VectorType<SelectedRowsType>"));
+                }
 
                 // get input args def type
                 auto args_def = kernel.args_def();
@@ -949,12 +1009,30 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
                   // build memcopy op
                   auto out_place =
                       phi::TransToPhiPlace(kernel.InputAt(i).backend);
-                  auto out_type = dialect::AllocatedDenseTensorType::get(
-                      ctx,
-                      out_place,
-                      pre_define_op->operand_source(j)
-                          .type()
-                          .dyn_cast<dialect::DenseTensorType>());
+
+                  ir::Type out_type;
+                  if (in_i_type.isa<dialect::AllocatedDenseTensorType>()) {
+                    out_type = dialect::AllocatedDenseTensorType::get(
+                        ctx,
+                        out_place,
+                        pre_define_op->operand_source(j)
+                            .type()
+                            .dyn_cast<dialect::DenseTensorType>());
+                  } else if (in_i_type
+                                 .isa<dialect::AllocatedSelectedRowsType>()) {
+                    out_type = dialect::AllocatedSelectedRowsType::get(
+                        ctx,
+                        out_place,
+                        pre_define_op->operand_source(j)
+                            .type()
+                            .dyn_cast<dialect::SelectedRowsType>());
+                  } else {
+                    PADDLE_THROW(phi::errors::Unimplemented(
+                        "builtin.combine Input type only support "
+                        "VectorType<DenseTensorType> and "
+                        "VectorType<SelectedRowsType>"));
+                  }
+
                   in_i = AddPlaceTransferOp(in_i,
                                             out_type,
                                             place,
diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index 38d833fc312de..e22fa5f3b3779 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -1112,8 +1112,8 @@ struct AddNOpTranscriber : public OpTranscriber {
     }
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
-      IR_THROW(
-          "Op assign_value should have corresponding OpInfo pd.assign_value_");
+      IR_THROW("Op assign_value should have corresponding OpInfo %s",
+               target_op_name);
     }
 
     return op_info;
diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc
index 37503105c8bc0..c8225a160428f 100644
--- a/paddle/fluid/memory/allocation/custom_allocator.cc
+++ b/paddle/fluid/memory/allocation/custom_allocator.cc
@@ -29,7 +29,8 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) {
       platform::errors::PermissionDenied("CustomDevice memory is "
                                          "freed in incorrect device. "
                                          "This may be a bug"));
-
+  phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryDeallocate(
+      allocation->ptr(), allocation->size());
   delete allocation;
 }
 
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 18680fe678b5d..a7f6bc512ffce 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -284,7 +284,13 @@ class FCMKLDNNHandler
 
   std::shared_ptr<dnnl::memory> AcquireWeightsMemoryWithReorder(
       const phi::DenseTensor* weights, const std::vector<float>& scale_data) {
-    const std::string weights_key = this->memory_key_ + "@weights";
+    const std::string weights_base_key = this->memory_key_ + "@weights";
+    std::string weights_key;
+    weights_key.reserve(128);
+    weights_key = phi::funcs::ExtendKeyWithThreadInfoIfNeeded(
+        dev_ctx_,
+        phi::funcs::CreateKey(
+            dev_ctx_, weights_base_key, this->fwd_pd_->weights_desc()));
     auto memory_p = std::static_pointer_cast<dnnl::memory>(
         this->dev_ctx_.GetBlob(weights_key));
 
@@ -410,7 +416,8 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
         phi::funcs::CreateKey(dev_ctx,
                               ctx.InputName("Input"),
                               ctx.InputName("W"),
-                              phi::vectorize(x->dims())));
+                              phi::vectorize(x->dims()),
+                              phi::vectorize(weights->dims())));
 
     auto inner_product_cache =
         std::static_pointer_cast<InnerProductCache>(dev_ctx.GetBlob(cache_key));
diff --git a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
index 8322b0ba2be83..783066f0fc906 100644
--- a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
+++ b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
@@ -211,6 +211,11 @@ class StaticTensorOperants : public TensorOperantsBase {
 #include "paddle/fluid/prim/api/manual_prim/prim_manual_api.h"
 #include "paddle/fluid/prim/utils/static/desc_tensor.h"
 
+#include "paddle/fluid/primitive/backend/backend.h"
+#include "paddle/fluid/primitive/type/lazy_tensor.h"
+
+PHI_DECLARE_bool(enable_new_ir_api);
+
 """
 
 
@@ -219,47 +224,88 @@ class StaticTensorOperants : public TensorOperantsBase {
 
 namespace prim {
 using DescTensor = paddle::prim::DescTensor;
+using LazyTensor = paddle::primitive::LazyTensor;
 
 Tensor StaticTensorOperants::add(const Tensor& x, const Scalar& y) {
-  return paddle::prim::add<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::add<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
+  } else {
+    return paddle::prim::add<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  }
 }
 
 Tensor StaticTensorOperants::subtract(const Tensor& x, const Scalar& y) {
-  return paddle::prim::subtract<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::subtract<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
+  } else {
+    return paddle::prim::subtract<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  }
 }
 
 Tensor StaticTensorOperants::multiply(const Tensor& x, const Scalar& y) {
-  return paddle::prim::scale<DescTensor>(x, y, 0.0f, true);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::scale<LazyTensor>(x, y, 0.0f, true);
+  } else {
+    return paddle::prim::scale<DescTensor>(x, y, 0.0f, true);
+  }
 }
 
 Tensor StaticTensorOperants::divide(const Tensor& x, const Scalar& y) {
-  return paddle::prim::divide<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::divide<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
+  } else {
+    return paddle::prim::divide<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  }
 }
 
 Tensor StaticTensorOperants::add(const Scalar& x, const Tensor& y) {
-  return paddle::prim::add<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::add<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  } else {
+    return paddle::prim::add<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  }
 }
 
+
 Tensor StaticTensorOperants::subtract(const Scalar& x, const Tensor& y) {
-  return paddle::prim::subtract<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::subtract<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  } else {
+    return paddle::prim::subtract<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  }
 }
 
 Tensor StaticTensorOperants::multiply(const Scalar& x, const Tensor& y) {
-  return paddle::prim::scale<DescTensor>(y, x, 0.0f, true);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::scale<LazyTensor>(y, x, 0.0f, true);
+  } else {
+    return paddle::prim::scale<DescTensor>(y, x, 0.0f, true);
+  }
 }
 
 Tensor StaticTensorOperants::divide(const Scalar& x, const Tensor& y) {
-  return paddle::prim::divide<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::divide<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  } else {
+    return paddle::prim::divide<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
+  }
 }
 
 Tensor StaticTensorOperants::pow(const Tensor& x, const Tensor& y) {
-  return paddle::prim::elementwise_pow<DescTensor>(x, y);
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::elementwise_pow<LazyTensor>(x, y);
+  } else {
+    return paddle::prim::elementwise_pow<DescTensor>(x, y);
+  }
 }
 
 Tensor StaticTensorOperants::pow(const Tensor& x, const Scalar& y) {
-  return paddle::prim::elementwise_pow<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  if (FLAGS_enable_new_ir_api) {
+    return paddle::primitive::backend::elementwise_pow<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
+  } else {
+    return paddle::prim::elementwise_pow<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
+  }
 }
-
 """
 
 
@@ -339,13 +385,21 @@ def gene_eager_tensor_operants_implementation(self):
 
     def gene_static_tensor_func_call(self):
+        """Emit C++ that calls the new-IR backend or the prim API by flag."""
         api_func_name = self.get_api_func_name()
-
+        backend_static_func_name = (
+            'paddle::primitive::backend::' + api_func_name + '<LazyTensor>'
+        )
         prim_static_func_name = (
             'paddle::prim::' + api_func_name + '<DescTensor>'
         )
-        prim_static_func_parameters = self.get_func_args()
+        static_func_parameters = self.get_func_args()
+        static_tensor_func_call = f"""if (FLAGS_enable_new_ir_api) {{
+    return {backend_static_func_name}({static_func_parameters});
+  }} else {{
+    return {prim_static_func_name}({static_func_parameters});
+  }}"""
 
-        return f"""return {prim_static_func_name}({prim_static_func_parameters});"""
+        return static_tensor_func_call
 
     def gene_static_tensor_operants_implementation(self):
         api_code = ""
diff --git a/paddle/fluid/prim/utils/static/CMakeLists.txt b/paddle/fluid/prim/utils/static/CMakeLists.txt
index aa72fadb591a6..483c3eabc05d1 100644
--- a/paddle/fluid/prim/utils/static/CMakeLists.txt
+++ b/paddle/fluid/prim/utils/static/CMakeLists.txt
@@ -6,4 +6,4 @@ cc_library(
 cc_library(
   static_tensor_operants
   SRCS static_tensor_operants.cc
-  DEPS static_prim_api)
+  DEPS static_prim_api primitive_backend_static_experimental)
diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
index ab040254355f5..6d69433737633 100644
--- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
@@ -10,7 +10,9 @@
 #include "paddle/fluid/primitive/type/lazy_tensor.h"
 #include "paddle/fluid/primitive/utils/utils.h"
 #include "paddle/ir/core/operation.h"
+#include "paddle/phi/core/flags.h"
 
+PHI_DECLARE_string(tensor_operants_mode);
 
 namespace paddle {
 namespace primitive {
@@ -95,6 +97,7 @@ for (size_t i=0; i< stop_gradients[0].size(); i++ ) {
 {% endmacro %}
 
 {% macro body_prim(api) %}
+FLAGS_tensor_operants_mode = "static";
   {% for i in range(api.outputs|length) %}
     {% if api.outputs[i].typename=='Tensor' %}
 paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{{i}}][0] : nullptr; 
diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h
index e018cccdef7a0..12fb66127a298 100644
--- a/paddle/fluid/primitive/rule/vjp/details.h
+++ b/paddle/fluid/primitive/rule/vjp/details.h
@@ -39,10 +39,7 @@ void divide_grad(const Tensor& x,
                  Tensor* dy) {
   if (dy) {
     // dy = -(x/y^2) * dout
-    auto denominator =
-        elementwise_pow<T>(y, full<T>(y.shape(), 2.0, y.dtype(), y.place()));
-    auto dy_res = scale<T>(
-        multiply<T>(divide<T>(x, denominator), out_grad), -1.0, 0.0, true);
+    auto dy_res = -(x / y.pow(2.0)) * out_grad;
     if (x.dims() != y.dims()) {
       // Maybe need reduce here
       phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims());
@@ -61,7 +58,7 @@ void divide_grad(const Tensor& x,
   if (dx) {
     // dx = (1/y) * dout
     auto one_tensor = full<T>(phi::vectorize(y.dims()), 1.0, y.dtype());
-    auto dx_res = multiply<T>(divide<T>(one_tensor, y), out_grad);
+    auto dx_res = one_tensor / y * out_grad;
     if (y.dims() != x.dims()) {
       // Maybe need reduce here
       auto reduce_dim = get_reduce_dims(x.dims(), y.dims());
diff --git a/paddle/ir/core/block.cc b/paddle/ir/core/block.cc
index f99ec340e4c49..04d59e2582ebe 100644
--- a/paddle/ir/core/block.cc
+++ b/paddle/ir/core/block.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/ir/core/block.h"
+
+#include <unordered_set>
+
 #include "paddle/ir/core/enforce.h"
 #include "paddle/ir/core/operation.h"
 #include "paddle/ir/core/region.h"
@@ -60,4 +63,34 @@ Block::UseIterator Block::use_end() const { return Block::UseIterator(); }
 
 bool Block::HasOneUse() const { return first_use_ && !first_use_.next_use(); }
 
+// Reorders the block's ops to follow `new_op_list`, which must have the same
+// size as the current list and be topologically ordered.
+void Block::ResetOpListOrder(const OpListType &new_op_list) {
+  IR_ENFORCE(new_op_list.size() == ops_.size(),
+             "The size of new_op_list not same with ops_.");
+  IR_ENFORCE(TopoOrderCheck(new_op_list),
+             "The new_op_list is not in topological order.");
+
+  ops_.clear();
+  for (Operation *reordered : new_op_list) push_back(reordered);
+}
+
+// True when every non-null operand of each op in `op_list` is the result of
+// an op located earlier in that list.
+bool Block::TopoOrderCheck(const OpListType &op_list) {
+  std::unordered_set<Value> defined_values;
+  for (const Operation *op : op_list) {
+    const size_t operand_count = op->num_operands();
+    for (size_t idx = 0; idx < operand_count; ++idx) {
+      auto source = op->operand_source(idx);
+      if (source && defined_values.count(source) == 0) return false;
+    }
+    const size_t result_count = op->results().size();
+    for (size_t idx = 0; idx < result_count; ++idx) {
+      defined_values.insert(op->result(idx));
+    }
+  }
+  return true;
+}
+
 }  // namespace ir
diff --git a/paddle/ir/core/block.h b/paddle/ir/core/block.h
index ebe4b6cb8ecf4..2cf00037eb5fc 100644
--- a/paddle/ir/core/block.h
+++ b/paddle/ir/core/block.h
@@ -70,6 +70,8 @@ class IR_API Block {
   bool HasOneUse() const;
   BlockOperand *first_use_addr() { return &first_use_; }
 
+  void ResetOpListOrder(const OpListType &new_op_list);
+
  private:
   Block(Block &) = delete;
   Block &operator=(const Block &) = delete;
@@ -78,6 +80,8 @@ class IR_API Block {
   friend class Region;
   void SetParent(Region *parent, Region::iterator position);
 
+  static bool TopoOrderCheck(const OpListType &op_list);
+
  private:
   Region *parent_;  // not owned
   OpListType ops_;  // owned
diff --git a/paddle/ir/dialect/shape/ir/shape_op.cc b/paddle/ir/dialect/shape/ir/shape_op.cc
index 3681aafa36520..776503ea269e3 100644
--- a/paddle/ir/dialect/shape/ir/shape_op.cc
+++ b/paddle/ir/dialect/shape/ir/shape_op.cc
@@ -112,6 +112,7 @@ bool SymbolicDim::merge(SymbolicDim other) {
   if (!isDynamic() && !other.isDynamic() && getValue() != other.getValue())
     return false;
   if (isDynamic() && !other.isDynamic()) updateValue(other.getValue());
+  if (!isDynamic() && other.isDynamic()) other.updateValue(getValue());
 
   bool knownNonNegativeFlag =
       getKnownNonNegative() || other.getKnownNonNegative();
diff --git a/paddle/ir/dialect/shape/utils/shape_utils.cc b/paddle/ir/dialect/shape/utils/shape_utils.cc
index 182d335f71c3d..f9d78a63184cb 100644
--- a/paddle/ir/dialect/shape/utils/shape_utils.cc
+++ b/paddle/ir/dialect/shape/utils/shape_utils.cc
@@ -46,6 +46,154 @@ const std::string SymbolTable::insert(ir::Operation* symbol) {
   return name;
 }
 
+bool SymbolicDimMgr::load() {
+  for (auto cur_op : *m_.block()) {
+    symbolTable_.insert(cur_op);  // every op goes into the symbol table
+    if (SymbolicDim dim = cur_op->dyn_cast<SymbolicDim>()) {
+      symbolDimUnionSet_[dim] = dim;  // initially mapped to itself
+      symbolNameSet_.insert(dim.getSymName());
+    }
+  }
+  return loadShapeConstraintGraph();  // false if a constraint fails to load
+}
+
+// Loads all tie_product_equal constraints; false on the first failure.
+bool SymbolicDimMgr::loadShapeConstraintGraph() {
+  // TODO(liujinnan): add more constraint function. currently, only support
+  // tie_product_equal.
+  auto constraint_vec =
+      symbolTable_.lookup<ir::dialect::TieProductEqualOp>("tie_product_equal");
+  if (!constraint_vec.size()) return true;
+  // Constants scale product.factor; dim ops contribute their SymbolicDim.
+  auto build_sym_product = [&](std::vector<ir::Value> range,
+                               SymbolicDimProduct& product) {
+    for (Value v : range) {
+      auto definingOp = v.GetDefiningOp();
+      if (!definingOp) return false;  // no defining op to classify
+      if (auto constOp = definingOp->dyn_cast<ir::ConstantOp>()) {
+        product.factor *= constOp.value().dyn_cast<ir::Int32Attribute>().data();
+      } else if (auto dimOp = definingOp->dyn_cast<ir::dialect::DimOp>()) {
+        auto sym = symbolTable_.lookup<SymbolicDim>(dimOp.getName());
+        if (!sym) return false;
+        product.symbols.push_back(sym);
+      } else {
+        return false;
+      }
+    }
+    return true;
+  };
+  for (auto op : constraint_vec) {
+    SymbolicDimProduct lhs, rhs;
+    if (!build_sym_product(op.getLhs(), lhs) ||
+        !build_sym_product(op.getRhs(), rhs) ||
+        !mapSymbolicDimProductEqual(lhs, rhs))
+      return false;
+  }
+  return true;
+}
+
+// Greatest common divisor of |m| and |n| (iterative Euclid); gcd(0, 0) == 0.
+int64_t gcd(int64_t m, int64_t n) {
+  while (n != 0) std::tie(m, n) = std::make_pair(n, m % n);
+  return std::abs(m);
+}
+
+// Records lhs == rhs. Simple cases (one symbol per side, or a constant versus
+// one symbol) map single dims; others go into productEqualityMap_.
+bool SymbolicDimMgr::mapSymbolicDimProductEqual(const SymbolicDimProduct& lhs,
+                                                const SymbolicDimProduct& rhs) {
+  SymbolicDimProduct newLhs, newRhs;
+  std::tie(newLhs, newRhs) = simplifySymbolicDimProductPair(lhs, rhs);
+  // early return for identity case.
+  if (newLhs == newRhs) return true;
+  const size_t lhsCount = newLhs.symbols.size();
+  const size_t rhsCount = newRhs.symbols.size();
+  if (lhsCount == 1 && rhsCount == 1 && newLhs.factor == newRhs.factor) {
+    return mapSymbolicDimEqual(newLhs.symbols[0], newRhs.symbols[0]);
+  }
+  if (lhsCount == 0 && rhsCount == 1 && newRhs.factor == 1) {
+    return mapSymbolicDimEqual(newConstantSymbolicDim(newLhs.factor),
+                               newRhs.symbols[0]);
+  }
+  if (rhsCount == 0 && lhsCount == 1 && newLhs.factor == 1) {
+    return mapSymbolicDimEqual(newConstantSymbolicDim(newRhs.factor),
+                               newLhs.symbols[0]);
+  }
+
+  productEqualityMap_[newLhs][newRhs] = productEqualityMap_[newRhs][newLhs] =
+      true;
+  productEqualityMapUpdated_ = false;
+  return true;
+}
+
+// Simplifies both products, divides the factors by their (signed) gcd and
+// cancels symbols shared by both sides that are known to be non-zero-sized.
+std::pair<SymbolicDimProduct, SymbolicDimProduct>
+SymbolicDimMgr::simplifySymbolicDimProductPair(const SymbolicDimProduct& x,
+                                               const SymbolicDimProduct& y) {
+  auto lhs = simplifySymbolicDimProduct(x);
+  auto rhs = simplifySymbolicDimProduct(y);
+  SymbolicDimProduct newLhs, newRhs;
+  const int64_t lhsAbs = std::abs(lhs.factor);
+  const int64_t rhsAbs = std::abs(rhs.factor);
+  int64_t gcdFactor = gcd(lhsAbs, rhsAbs);
+  // Both factors are zero: return two default-constructed products.
+  if (!gcdFactor) return std::make_pair(std::move(newLhs), std::move(newRhs));
+  // The divisor takes the sign of the factor with the smaller magnitude
+  // (rhs on ties), so that factor is non-negative after the division.
+  const int64_t signSource = (lhsAbs < rhsAbs) ? lhs.factor : rhs.factor;
+  if (signSource < 0) gcdFactor = -gcdFactor;
+
+  newLhs.factor = lhs.factor / gcdFactor;
+  newRhs.factor = rhs.factor / gcdFactor;
+
+  std::unordered_map<SymbolicDim, int, SymDimHasher> lhsSymbolMap;
+  std::unordered_map<SymbolicDim, int, SymDimHasher> rhsSymbolMap;
+  for (SymbolicDim op : lhs.symbols) ++lhsSymbolMap[op];
+  for (SymbolicDim op : rhs.symbols) ++rhsSymbolMap[op];
+
+  // Appends to `kept` each symbol that cannot be cancelled against a
+  // remaining occurrence counted in `otherCounts`.
+  auto keepRest = [](const auto& symbols, auto& otherCounts, auto& kept) {
+    for (SymbolicDim op : symbols) {
+      auto it = otherCounts.find(op);
+      if (it == otherCounts.end() || !op.getKnownNonSizeZero()) {
+        kept.push_back(op);
+      } else if (--it->second == 0) {
+        otherCounts.erase(it);
+      }
+    }
+  };
+  keepRest(lhs.symbols, rhsSymbolMap, newLhs.symbols);
+  keepRest(rhs.symbols, lhsSymbolMap, newRhs.symbols);
+
+  if (!newLhs.factor) newLhs.symbols.clear();
+  if (!newRhs.factor) newRhs.symbols.clear();
+
+  return std::make_pair(std::move(newLhs), std::move(newRhs));
+}
+
+// Maps symbols to their roots, sorts by name, folds static dims into factor.
+SymbolicDimProduct SymbolicDimMgr::simplifySymbolicDimProduct(
+    const SymbolicDimProduct& x) {
+  std::vector<SymbolicDim> roots;
+  roots.reserve(x.symbols.size());
+  for (SymbolicDim op : x.symbols) roots.push_back(getRootSymbolicDim(op));
+  std::sort(roots.begin(), roots.end(), [&](SymbolicDim a, SymbolicDim b) {
+    return compareSymbolicDimNames(a.getSymName(), b.getSymName());
+  });
+  SymbolicDimProduct newX;
+  newX.factor = x.factor;
+  for (SymbolicDim op : roots) {
+    if (op.isDynamic()) {
+      newX.symbols.push_back(op);
+    } else {
+      newX.factor *= op.getValue();
+    }
+  }
+  return newX;
+}
+
 const std::string SymbolicDimMgr::getNextName() {
   std::string name;
   do {
@@ -123,4 +271,154 @@ bool SymbolicDimMgr::mapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs) {
   return true;
 }
 
+SymbolicDimProduct* SymbolicDimMgr::symbolicDimProductDivide(
+    const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) {
+  // Caller owns the result; nullptr means rhs does not divide lhs exactly.
+  SymbolicDimProduct newLhs, newRhs;
+  std::tie(newLhs, newRhs) = simplifySymbolicDimProductPair(lhs, rhs);
+
+  if (newLhs.factor == 0 || newRhs.factor == 0) return nullptr;
+  if (newLhs.factor % newRhs.factor != 0) return nullptr;
+  if (newLhs.symbols.size() < newRhs.symbols.size()) return nullptr;
+
+  // Build the quotient by value so that no early return can leak it.
+  SymbolicDimProduct quotient;
+  quotient.factor = newLhs.factor / newRhs.factor;
+
+  std::unordered_map<SymbolicDim, int, SymDimHasher> symProcMap;
+  for (SymbolicDim sym : newRhs.symbols) ++symProcMap[sym];
+
+  for (SymbolicDim sym : newLhs.symbols) {
+    auto it = symProcMap.find(sym);
+    if (it == symProcMap.end()) {
+      quotient.symbols.push_back(sym);
+    } else if (--it->second == 0) {
+      symProcMap.erase(it);
+    }
+  }
+
+  // Any rhs symbol left unmatched means rhs is not a divisor of lhs.
+  if (!symProcMap.empty()) return nullptr;
+  return new SymbolicDimProduct(std::move(quotient));
+}
+
+bool SymbolicDimMgr::isMultipleOfKnownSymbolicDimProductEqualPair(
+    const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs) {
+  for (auto& pairOutter : productEqualityMap_) {
+    auto factorX = symbolicDimProductDivide(lhs, pairOutter.first);
+    if (!factorX) continue;  // lhs is not a multiple of this known product
+    bool matched = false;
+    for (auto& pairInner : pairOutter.second) {
+      if (matched || !pairInner.second) continue;
+      auto factorY = symbolicDimProductDivide(rhs, pairInner.first);
+      matched = factorY && (*factorX) == (*factorY);
+      delete factorY;  // quotients are heap objects owned by this function
+    }
+    delete factorX;
+    if (matched) return true;  // lhs and rhs share the same cofactor
+  }
+  return false;
+}
+
+bool SymbolicDimMgr::updateProductEqualityMap() {
+  // Early return when the map is already flagged as up to date.
+  if (productEqualityMapUpdated_) return true;
+
+  SymbolicDimProductMap newMap;
+  std::unordered_set<SymbolicDimProduct, SymProductHasher> productSet;
+  for (auto& pairOutter : productEqualityMap_) {
+    const SymbolicDimProduct& x = pairOutter.first;
+    for (auto& pairInner : pairOutter.second) {
+      if (!pairInner.second) continue;  // only pairs marked equal are kept
+      const SymbolicDimProduct& y = pairInner.first;
+      SymbolicDimProduct newX, newY;
+      std::tie(newX, newY) = simplifySymbolicDimProductPair(x, y);
+      if (newX == newY) continue;  // identical after simplification
+      newMap[newX][newY] = newMap[newY][newX] = true;
+      productSet.insert(newX);
+      productSet.insert(newY);
+    }
+  }
+  // Hashing a SymbolicDimProduct is expensive, so each product is mapped to an
+  // integer index first.
+  std::unordered_map<const SymbolicDimProduct*, size_t> symProd2Idx;
+  std::vector<const SymbolicDimProduct*> idx2SymProd(productSet.size());
+  std::vector<size_t> idx2root(productSet.size());
+  for (auto& x : productSet) {
+    size_t idx = symProd2Idx.size();
+    symProd2Idx[&x] = idx;
+    idx2SymProd[idx] = &x;
+    idx2root[idx] = idx;  // every product starts as its own root
+  }
+
+  auto getRootIdx = [&](size_t root) {  // find root, then compress the path
+    std::vector<size_t> path;
+    while (idx2root[root] != root) {
+      path.push_back(root);
+      root = idx2root[root];
+    }
+    for (size_t idx : path) idx2root[idx] = root;
+    return root;
+  };
+
+  for (size_t x = 0; x < symProd2Idx.size(); ++x) {
+    auto& xProd = *idx2SymProd[x];
+    auto& rowMap = newMap[xProd];
+    size_t xRoot = getRootIdx(x);
+    for (size_t y = x; y < symProd2Idx.size(); ++y) {
+      auto& yProd = *idx2SymProd[y];
+      if (!rowMap[yProd]) continue;
+      idx2root[getRootIdx(y)] = xRoot;  // merge y's group into x's group
+    }
+  }
+
+  for (size_t x = 0; x < symProd2Idx.size(); ++x)
+    for (size_t y = x; y < symProd2Idx.size(); ++y) {
+      if (getRootIdx(x) != getRootIdx(y)) continue;  // not in the same group
+      auto& xSymProd = *idx2SymProd[x];
+      auto& ySymProd = *idx2SymProd[y];
+
+      newMap[xSymProd][ySymProd] = newMap[ySymProd][xSymProd] = true;
+    }
+
+  productEqualityMap_ = std::move(newMap);
+
+  for (auto& x : productSet)
+    for (auto& y : productSet) {
+      if (!productEqualityMap_[x][y]) continue;
+      productEqualityMap_[x][y] = productEqualityMap_[y][x] = false;
+      if (!isMultipleOfKnownSymbolicDimProductEqualPair(x, y)) {
+        productEqualityMap_[x][y] = productEqualityMap_[y][x] = true;
+      }
+    }
+
+  std::unordered_set<SymbolicDimProduct, SymProductHasher> toRemove;
+  for (auto& x : productSet) {
+    if (std::all_of(productSet.begin(),
+                    productSet.end(),
+                    [&](const SymbolicDimProduct& y) {
+                      return !productEqualityMap_[x][y];
+                    })) {
+      toRemove.insert(x);  // x has no true entry left in its row
+    }
+  }
+
+  for (auto& x : toRemove) {
+    productEqualityMap_.erase(x);
+  }
+
+  productEqualityMapUpdated_ = true;
+  return true;
+}
+
+bool SymbolicDimMgr::isSymbolicDimProductEqual(const SymbolicDimProduct& lhs,
+                                               const SymbolicDimProduct& rhs) {
+  const auto reduced = simplifySymbolicDimProductPair(lhs, rhs);
+  const SymbolicDimProduct& reducedLhs = reduced.first;
+  const SymbolicDimProduct& reducedRhs = reduced.second;
+  // Identical after simplification: no need to consult the equality map.
+  if (reducedLhs == reducedRhs) return true;
+  IR_ENFORCE(updateProductEqualityMap(), "Update product equality map failed.");
+  return isMultipleOfKnownSymbolicDimProductEqualPair(reducedLhs, reducedRhs);
+}
 }  // namespace ir
diff --git a/paddle/ir/dialect/shape/utils/shape_utils.h b/paddle/ir/dialect/shape/utils/shape_utils.h
index 70f2a16c4481e..8d5fab1a1c811 100644
--- a/paddle/ir/dialect/shape/utils/shape_utils.h
+++ b/paddle/ir/dialect/shape/utils/shape_utils.h
@@ -18,6 +18,7 @@
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
+#include "paddle/ir/core/builtin_attribute.h"
 #include "paddle/ir/core/builtin_op.h"
 #include "paddle/ir/core/utils.h"
 #include "paddle/ir/dialect/shape/ir/shape_op.h"
@@ -45,7 +46,6 @@ class SymbolTable {
  public:
   explicit SymbolTable(ir::Operation* symbolTableOp)
       : symbolTableOp_(symbolTableOp) {}
-
   template <typename T>
   typename std::enable_if<std::is_same<T, SymbolicDim>::value,
                           SymbolicDim>::type
@@ -97,6 +97,7 @@ struct SymProductHasher {
 class SymbolicDimMgr {
  public:
   explicit SymbolicDimMgr(ir::ModuleOp m);
+  bool load();
   SymbolicDim newSymbolicDim(const std::string& name = {});
   SymbolicDim newConstantSymbolicDim(int64_t val);
   std::vector<SymbolicDim> createSymbolicDimsForRankedValue(Value value);
@@ -104,9 +105,28 @@ class SymbolicDimMgr {
   bool isSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs);
   SymbolTable& symbolTable() { return symbolTable_; }
   bool mapSymbolicDimEqual(SymbolicDim lhs, SymbolicDim rhs);
+  SymbolicDimProduct simplifySymbolicDimProduct(const SymbolicDimProduct& x);
+  std::pair<SymbolicDimProduct, SymbolicDimProduct>
+  simplifySymbolicDimProductPair(const SymbolicDimProduct& x,
+                                 const SymbolicDimProduct& y);
+  SymbolicDimProduct* symbolicDimProductDivide(const SymbolicDimProduct& x,
+                                               const SymbolicDimProduct& y);
+
+  bool save();  // TODO(liujinnan): load constraint func
+
+  bool isSymbolicDimProductEqual(const SymbolicDimProduct& lhs,
+                                 const SymbolicDimProduct& rhs);
+  bool mapSymbolicDimProductEqual(const SymbolicDimProduct& lhs,
+                                  const SymbolicDimProduct& rhs);
 
  private:
   const std::string getNextName();
+  bool updateProductEqualityMap();
+  bool isMultipleOfKnownSymbolicDimProductEqualPair(
+      const SymbolicDimProduct& lhs, const SymbolicDimProduct& rhs);
+  bool saveShapeConstraintGraph();  // TODO(liujinnan): load & save
+                                    // shape_constraint_func
+  bool loadShapeConstraintGraph();
 
  private:
   ir::ModuleOp m_;
@@ -127,6 +147,6 @@ class SymbolicDimMgr {
       std::unordered_map<SymbolicDimProduct, bool, SymProductHasher>,
       SymProductHasher>;
   SymbolicDimProductMap productEqualityMap_;
+  bool productEqualityMapUpdated_ = true;
 };
-
 }  // namespace ir
diff --git a/paddle/ir/transforms/reorder_block_ops_pass.cc b/paddle/ir/transforms/reorder_block_ops_pass.cc
new file mode 100644
index 0000000000000..d922326677985
--- /dev/null
+++ b/paddle/ir/transforms/reorder_block_ops_pass.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/ir/transforms/reorder_block_ops_pass.h"
+
+#include <queue>
+
+#include "paddle/ir/core/builtin_op.h"
+#include "paddle/ir/core/program.h"
+#include "paddle/ir/pass/pass.h"
+
+namespace {
+
+// TODO(wilber): After support SideEffectTrait, Only NoSideEffectTrait op can be
+// removed by dce pass.
+// Now just a naive implementation.
+class ReorderBlockOpsPass : public ir::Pass {
+ public:
+  ReorderBlockOpsPass() : ir::Pass("ReorderBlockOpsPass", 0) {}
+
+  void Run(ir::Operation *op) override {
+    IR_ENFORCE(op->num_regions() > 0,
+               "ReorderBlockOpsPass should run on Operation which regions "
+               "number greater than 0.");
+    for (size_t i = 0; i < op->num_regions(); ++i) {
+      for (auto *block : op->region(i)) {
+        std::list<ir::Operation *> res_op_list;
+        std::unordered_map<ir::Operation *, int>
+            reorder_op_dep_cnt;  // op -> dependent input count
+        std::unordered_set<ir::Value> visited_values;
+        std::queue<ir::Operation *> op_que;
+
+        // Marks op's results as visited and enqueues users that became ready.
+        auto update_op_que = [&](ir::Operation *op) {
+          for (size_t i = 0; i < op->results().size(); ++i) {
+            auto result = op->result(i);
+            visited_values.insert(result);
+            for (auto it = result.use_begin(); it != result.use_end(); ++it) {
+              if (reorder_op_dep_cnt.count(it->owner())) {
+                reorder_op_dep_cnt[it->owner()]--;
+                if (reorder_op_dep_cnt[it->owner()] == 0) {
+                  op_que.push(it->owner());
+                }
+              }
+            }
+          }
+        };
+
+        for (auto &op : *block) {
+          bool has_dependency = false;
+          // Count the operands whose value has not been visited yet.
+          for (size_t i = 0; i < op->num_operands(); ++i) {
+            auto operand = op->operand_source(i);
+            if (operand && visited_values.count(operand) == 0) {
+              reorder_op_dep_cnt[op]++;
+              has_dependency = true;
+            }
+          }
+          if (!has_dependency) {
+            res_op_list.push_back(op);
+            update_op_que(op);
+          }
+        }
+
+        if (reorder_op_dep_cnt.empty()) {
+          continue;  // no op was deferred; keep processing the other blocks
+        }
+
+        while (!op_que.empty()) {
+          auto *op = op_que.front();
+          op_que.pop();
+          res_op_list.push_back(op);
+          update_op_que(op);
+        }
+        VLOG(4) << "ReorderBlockOpsPass is applied.";
+        block->ResetOpListOrder(res_op_list);
+      }
+    }
+  }
+
+  bool CanApplyOn(ir::Operation *op) const override {
+    return op->num_regions() > 0;
+  }
+};
+
+}  // namespace
+
+namespace ir {
+
+std::unique_ptr<Pass> CreateReorderBlockOpsPass() {
+  return std::make_unique<ReorderBlockOpsPass>();  // new pass, caller-owned
+}
+
+}  // namespace ir
diff --git a/paddle/ir/transforms/reorder_block_ops_pass.h b/paddle/ir/transforms/reorder_block_ops_pass.h
new file mode 100644
index 0000000000000..f668471fc9e04
--- /dev/null
+++ b/paddle/ir/transforms/reorder_block_ops_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/ir/core/dll_decl.h"
+
+namespace ir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateReorderBlockOpsPass();
+
+}  // namespace ir
diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h
index 84e36a26ca487..1d61004b36161 100644
--- a/paddle/phi/backends/onednn/onednn_helper.h
+++ b/paddle/phi/backends/onednn/onednn_helper.h
@@ -154,6 +154,12 @@ inline void AppendKey(std::string* key, const T& num) {
   key->append(std::to_string(num));
 }
 
+template <>
+inline void AppendKey(std::string* key,
+                      const dnnl::memory::format_kind& format) {
+  key->append(std::to_string(static_cast<int>(format)));  // enum as integer
+}
+
 template <>
 inline void AppendKey(std::string* key,
                       const dnnl::memory::format_tag& format) {
@@ -171,6 +177,25 @@ inline void AppendKey(std::string* key, const dnnl::algorithm& algorithm) {
   key->append(std::to_string(static_cast<int>(algorithm)));
 }
 
+template <>
+inline void AppendKey(std::string* key, const dnnl::memory::dims& dims) {
+  // Appends every extent in order through the scalar AppendKey overload;
+  // nothing is written between consecutive extents by this function.
+  for (const auto& dim : dims) AppendKey(key, static_cast<int64_t>(dim));
+}
+
+template <>
+inline void AppendKey(std::string* key, const dnnl::memory::desc& md) {
+  AppendKey(key, md.get_dims());  // each descriptor field is appended in turn
+  AppendKey(key, md.get_data_type());
+  AppendKey(key, md.get_format_kind());
+  AppendKey(key, md.get_inner_blks());
+  AppendKey(key, md.get_inner_idxs());
+  AppendKey(key, md.get_inner_nblks());
+  AppendKey(key, md.get_padded_dims());
+  AppendKey(key, md.get_strides());
+}
+
 template <>
 inline void AppendKey(std::string* key,
                       const dnnl::normalization_flags& flags) {
diff --git a/paddle/phi/core/extended_tensor.cc b/paddle/phi/core/extended_tensor.cc
index e5b5c3773f867..31d0fb25c88c1 100644
--- a/paddle/phi/core/extended_tensor.cc
+++ b/paddle/phi/core/extended_tensor.cc
@@ -38,7 +38,7 @@ DataType ExtendedTensor::dtype() const {
 
 DataLayout ExtendedTensor::layout() const {
   PADDLE_THROW(phi::errors::Unavailable(
-      "ExtendedTensor does not support `dtype` method."));
+      "ExtendedTensor does not support `layout` method."));
 }
 
 bool ExtendedTensor::valid() const {
diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc
index 5ea5a07960923..9b9df5c1ff4aa 100644
--- a/paddle/phi/core/meta_tensor.cc
+++ b/paddle/phi/core/meta_tensor.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
+#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/enforce.h"
@@ -271,6 +272,8 @@ const LoD& MetaTensor::lod() const {
     return static_cast<SparseCooTensor*>(tensor_)->non_zero_elements().lod();
   } else if (phi::SparseCsrTensor::classof(tensor_)) {
     return static_cast<SparseCsrTensor*>(tensor_)->non_zero_elements().lod();
+  } else if (paddle::dialect::IrMetaTensor::classof(tensor_)) {
+    return static_cast<paddle::dialect::IrMetaTensor*>(tensor_)->lod();
   } else {
     PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.",
                                             tensor_->type_info().name()));
diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc
index 38e17b57f633d..99b134b6e7960 100644
--- a/paddle/phi/core/utils/type_info.cc
+++ b/paddle/phi/core/utils/type_info.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 
+#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/custom/custom_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -50,6 +51,7 @@ template class TypeInfoTraits<phi::TensorBase, SparseCsrTensor>;
 template class TypeInfoTraits<phi::TensorBase, StringTensor>;
 template class TypeInfoTraits<phi::TensorBase, TensorArray>;
 template class TypeInfoTraits<phi::TensorBase, phi::distributed::DistTensor>;
+template class TypeInfoTraits<phi::TensorBase, paddle::dialect::IrMetaTensor>;
 
 template class TypeInfoTraits<phi::DeviceContext, CPUContext>;
 template class TypeInfoTraits<phi::DeviceContext, CustomContext>;
diff --git a/paddle/utils/flags.h b/paddle/utils/flags.h
index 3f68ba2f5dc1f..06c84ae15ab7f 100644
--- a/paddle/utils/flags.h
+++ b/paddle/utils/flags.h
@@ -79,8 +79,8 @@ using gflags::DoubleFromEnv;
 using gflags::Int32FromEnv;
 using gflags::Int64FromEnv;
 using gflags::StringFromEnv;
-using gflags::UInt32FromEnv;
-using gflags::UInt64FromEnv;
+using gflags::Uint32FromEnv;
+using gflags::Uint64FromEnv;
 #else
 #define DEFINE_FROM_ENV_FUNC(type, name)                     \
   inline type name##FromEnv(const std::string& env_var_name, \
@@ -90,9 +90,9 @@ using gflags::UInt64FromEnv;
 
 DEFINE_FROM_ENV_FUNC(bool, Bool);
 DEFINE_FROM_ENV_FUNC(int32_t, Int32);
-DEFINE_FROM_ENV_FUNC(uint32_t, UInt32);
+DEFINE_FROM_ENV_FUNC(uint32_t, Uint32);
 DEFINE_FROM_ENV_FUNC(int64_t, Int64);
-DEFINE_FROM_ENV_FUNC(uint64_t, UInt64);
+DEFINE_FROM_ENV_FUNC(uint64_t, Uint64);
 DEFINE_FROM_ENV_FUNC(double, Double);
 DEFINE_FROM_ENV_FUNC(std::string, String);
 
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index dbcc4f0c05fda..f0411d096dee4 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1594,9 +1594,9 @@ def poisson_nll_loss(
             + 0.5 * paddle.log(2 * math.pi * label)
         )
         loss_out += paddle.where(
-            stirling_approx <= 1,
-            paddle.zeros_like(stirling_approx),
+            label > 1,
             stirling_approx,
+            paddle.zeros_like(stirling_approx),
         )
     if reduction == 'mean':
         loss_out = paddle.mean(loss_out)
diff --git a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
index e007b73c9f0ed..fcca8cde7d5aa 100644
--- a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
+++ b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
@@ -42,6 +42,7 @@
 #include "paddle/ir/pattern_rewrite/pattern_match.h"
 #include "paddle/ir/pattern_rewrite/pattern_rewrite_driver.h"
 #include "paddle/ir/transforms/dead_code_elimination_pass.h"
+#include "paddle/ir/transforms/reorder_block_ops_pass.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in
@@ -1099,6 +1100,7 @@ TEST(pattern_rewrite, Patterns) {
   pm.AddPass(std::make_unique<TestPass>());
   pm.AddPass(ir::CreateConstantFoldingPass());
   pm.AddPass(ir::CreateDeadCodeEliminationPass());
+  pm.AddPass(ir::CreateReorderBlockOpsPass());
   pm.EnablePassTiming();
   pm.EnableIRPrinting();
   // pm.EnableIRPrinting(std::make_unique<ir::PassManager::IRPrinterOption>(
diff --git a/test/cpp/ir/shape_dialect/symbolic_op_test.cc b/test/cpp/ir/shape_dialect/symbolic_op_test.cc
index 7b0751d17ac13..138e5e5b0d8c9 100644
--- a/test/cpp/ir/shape_dialect/symbolic_op_test.cc
+++ b/test/cpp/ir/shape_dialect/symbolic_op_test.cc
@@ -93,7 +93,10 @@ TEST(assist_struct_test, symbolic_dim_table) {
   EXPECT_FALSE(symbolTable.lookup<ir::dialect::SymbolicDim>("S1"));
 }
 
-TEST(assist_struct_test, symbolic_dim_mgr) {
+TEST(assist_struct_test, symbolic_dim_mgr_simple) {
+  /******************************************************/
+  /* Mgr simple version, only SymbolicDim related func. */
+  /******************************************************/
   ir::IrContext *ctx = ir::IrContext::Instance();
   ir::Program program(ctx);
   ctx->GetOrRegisterDialect<ir::dialect::ShapeDialect>();
@@ -141,6 +144,175 @@ TEST(assist_struct_test, symbolic_dim_mgr) {
   EXPECT_FALSE(symDimMgr.isSymbolicDimEqual(symDimS0, symDimC10));
 }
 
+TEST(assist_struct_test, symbolic_dim_mgr_complex) {
+  /***************************************************************/
+  /* Mgr with constraintOp, and SymbolicDimProduct related func. */
+  /***************************************************************/
+  ir::IrContext *ctx = ir::IrContext::Instance();
+  ir::Program program(ctx);
+  ctx->GetOrRegisterDialect<ir::dialect::ShapeDialect>();
+  ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
+  ir::Builder builder = ir::Builder(ctx, program.block());
+
+  ir::dialect::SymbolicDim symDimS0 = builder.Build<ir::dialect::SymbolicDim>(
+      "S0", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS1 = builder.Build<ir::dialect::SymbolicDim>(
+      "S1", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS2 = builder.Build<ir::dialect::SymbolicDim>(
+      "S2", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS3 = builder.Build<ir::dialect::SymbolicDim>(
+      "S3", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS4 = builder.Build<ir::dialect::SymbolicDim>(
+      "S4", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS5 = builder.Build<ir::dialect::SymbolicDim>(
+      "S5", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS6 = builder.Build<ir::dialect::SymbolicDim>(
+      "S6", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS7 = builder.Build<ir::dialect::SymbolicDim>(
+      "S7", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS8 = builder.Build<ir::dialect::SymbolicDim>(
+      "S8", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS9 = builder.Build<ir::dialect::SymbolicDim>(
+      "S9", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS10 = builder.Build<ir::dialect::SymbolicDim>(
+      "S10", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS11 = builder.Build<ir::dialect::SymbolicDim>(
+      "S11", -100000, false, false, true, true);
+  ir::dialect::SymbolicDim symDimS12 = builder.Build<ir::dialect::SymbolicDim>(
+      "S12", -100000, false, false, true, false);
+  ir::dialect::SymbolicDim symDimC10 = builder.Build<ir::dialect::SymbolicDim>(
+      "C10", 10, true, false, true, true);
+  ir::dialect::SymbolicDim symDimC20 = builder.Build<ir::dialect::SymbolicDim>(
+      "C20", 20, true, false, true, true);
+
+  ir::OpResult dimOpS0 = builder.Build<ir::dialect::DimOp>("S0").out();
+  ir::OpResult dimOpS1 = builder.Build<ir::dialect::DimOp>("S1").out();
+  ir::OpResult dimOpS2 = builder.Build<ir::dialect::DimOp>("S2").out();
+  ir::OpResult dimOpS3 = builder.Build<ir::dialect::DimOp>("S3").out();
+  ir::OpResult dimOpS4 = builder.Build<ir::dialect::DimOp>("S4").out();
+  ir::OpResult dimOpS5 = builder.Build<ir::dialect::DimOp>("S5").out();
+  ir::OpResult dimOpS6 = builder.Build<ir::dialect::DimOp>("S6").out();
+  ir::OpResult dimOpS7 = builder.Build<ir::dialect::DimOp>("S7").out();
+  ir::OpResult dimOpS8 = builder.Build<ir::dialect::DimOp>("S8").out();
+  ir::OpResult dimOpS9 = builder.Build<ir::dialect::DimOp>("S9").out();
+  ir::OpResult dimOpS10 = builder.Build<ir::dialect::DimOp>("S10").out();
+  ir::OpResult dimOpS11 = builder.Build<ir::dialect::DimOp>("S11").out();
+  ir::OpResult dimOpC10 = builder.Build<ir::dialect::DimOp>("C10").out();
+  ir::OpResult dimOpC20 = builder.Build<ir::dialect::DimOp>("C20").out();
+  ir::OpResult constant =
+      builder
+          .Build<ir::ConstantOp>(ir::Int32Attribute::get(ctx, 2),
+                                 ir::Int32Type::get(ctx))
+          ->result(0);
+
+  // Mark S1 == S2.
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      2, 2, std::vector<ir::OpResult>{constant, dimOpS1, dimOpS2, constant});
+  // Mark S0 * S1 == S2 * S3, For check S0 == S3.
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      2, 2, std::vector<ir::OpResult>{dimOpS0, dimOpS1, dimOpS2, dimOpS3});
+  // Mark S4 * S0 * S1 == S2 * S3 * S5, For check S4 == S5.
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      3,
+      3,
+      std::vector<ir::OpResult>{
+          dimOpS4, dimOpS0, dimOpS1, dimOpS2, dimOpS3, dimOpS5});
+  // For check S6 == C10 * C20.
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      1, 2, std::vector<ir::OpResult>{dimOpS6, dimOpC10, dimOpC20});
+  // Mark C10 * S0 * S1 == S2 * S3 * S7, for check C10 == S7.
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      3,
+      3,
+      std::vector<ir::OpResult>{
+          dimOpC10, dimOpS0, dimOpS1, dimOpS2, dimOpS3, dimOpS7});
+
+  // Mark S8 * S9 == S10 * S11, for the unsimplified product case
+  builder.Build<ir::dialect::TieProductEqualOp>(
+      2, 2, std::vector<ir::OpResult>{dimOpS8, dimOpS9, dimOpS10, dimOpS11});
+
+  ir::SymbolicDimMgr symDimMgr(program.module_op());
+
+  symDimMgr.load();
+
+  // For check indirect equality: S1 * S4 == S2 * S5
+  ir::SymbolicDimProduct symDimProductLhs;
+  ir::SymbolicDimProduct symDimProductRhs;
+
+  symDimProductLhs.symbols.push_back(symDimS1);
+  symDimProductLhs.symbols.push_back(symDimS4);
+
+  symDimProductRhs.symbols.push_back(symDimS2);
+  symDimProductRhs.symbols.push_back(symDimS5);
+
+  // Partially simplified product check: S8 * S9 * S12 == S10 * S11 * S12
+  ir::SymbolicDimProduct symDimProductLhs_;
+  ir::SymbolicDimProduct symDimProductRhs_;
+
+  symDimProductLhs_.symbols.push_back(symDimS8);
+  symDimProductLhs_.symbols.push_back(symDimS9);
+  symDimProductLhs_.symbols.push_back(symDimS12);
+
+  symDimProductRhs_.symbols.push_back(symDimS10);
+  symDimProductRhs_.symbols.push_back(symDimS11);
+  symDimProductRhs_.symbols.push_back(symDimS12);
+
+  // simplifySymbolicDimProduct: {factor = 1, Sym = {S7}} => {factor = 10}
+  ir::SymbolicDimProduct symDimProductS7;
+  symDimProductS7.symbols.push_back(symDimS7);
+  ir::SymbolicDimProduct simplifiedProductS7 =
+      symDimMgr.simplifySymbolicDimProduct(symDimProductS7);
+
+  // For check simplifySymbolicDimProductPair, X * Y * Y, Y * Y * Z => X, Z
+  ir::SymbolicDimProduct symDimProductPairLhs;
+  ir::SymbolicDimProduct symDimProductPairRhs;
+  ir::SymbolicDimProduct newLhs, newRhs;
+  symDimProductPairLhs.symbols.push_back(symDimS4);
+  symDimProductPairLhs.symbols.push_back(symDimS1);
+  symDimProductPairLhs.symbols.push_back(symDimS2);
+  symDimProductPairRhs.symbols.push_back(symDimS1);
+  symDimProductPairRhs.symbols.push_back(symDimS2);
+  symDimProductPairRhs.symbols.push_back(symDimS3);
+
+  std::tie(newLhs, newRhs) = symDimMgr.simplifySymbolicDimProductPair(
+      symDimProductPairLhs, symDimProductPairRhs);
+
+  // Divide check: {S4 * S1 * C20} / {S1 * C10} => {factor = 2, Sym = {S4}}
+  ir::SymbolicDimProduct symDimProductDivLhs;
+  ir::SymbolicDimProduct symDimProductDivRhs;
+  symDimProductDivLhs.symbols.push_back(symDimS4);
+  symDimProductDivLhs.symbols.push_back(symDimS1);
+  symDimProductDivLhs.symbols.push_back(symDimC20);
+  symDimProductDivRhs.symbols.push_back(symDimS1);
+  symDimProductDivRhs.symbols.push_back(symDimC10);
+
+  ir::SymbolicDimProduct *divRes = symDimMgr.symbolicDimProductDivide(
+      symDimProductDivLhs, symDimProductDivRhs);
+
+  EXPECT_TRUE(symDimMgr.isSymbolicDimEqual(symDimS1, symDimS2));
+  EXPECT_TRUE(symDimMgr.isSymbolicDimEqual(symDimS0, symDimS3));
+  EXPECT_TRUE(symDimMgr.isSymbolicDimEqual(symDimS4, symDimS5));
+  EXPECT_EQ(symDimS6.getValue(), 200);
+  EXPECT_EQ(symDimMgr.symbolTable().lookup<ir::dialect::SymbolicDim>("C20"),
+            symDimC20);
+  EXPECT_EQ(symDimS7.getValue(), symDimC10.getValue());
+  EXPECT_EQ(simplifiedProductS7.factor, 10);
+  EXPECT_EQ(simplifiedProductS7.symbols.size(), static_cast<size_t>(0));
+  EXPECT_EQ(newLhs.symbols.size(), static_cast<size_t>(1));
+  EXPECT_EQ(newRhs.symbols.size(), static_cast<size_t>(1));
+  EXPECT_EQ(newLhs.symbols[0], symDimMgr.getRootSymbolicDim(symDimS4));
+  EXPECT_EQ(newRhs.symbols[0], symDimMgr.getRootSymbolicDim(symDimS3));
+  EXPECT_TRUE(
+      symDimMgr.isSymbolicDimProductEqual(symDimProductLhs, symDimProductRhs));
+  EXPECT_TRUE(symDimMgr.isSymbolicDimProductEqual(symDimProductLhs_,
+                                                  symDimProductRhs_));
+  ASSERT_TRUE(divRes != nullptr);  // guard the dereferences below
+  EXPECT_EQ(divRes->factor, 2);
+  EXPECT_EQ(divRes->symbols.size(), static_cast<size_t>(1));
+  EXPECT_EQ(divRes->symbols[0], symDimMgr.getRootSymbolicDim(symDimS4));
+  delete divRes;  // symbolicDimProductDivide hands ownership to the caller
+}
+
 TEST(assist_struct_test, dim) {
   ir::IrContext *ctx = ir::IrContext::Instance();
   ir::Program program(ctx);
diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py
index 2c69cf2072cf9..09ea063f9ad8e 100644
--- a/test/dygraph_to_static/test_simnet.py
+++ b/test/dygraph_to_static/test_simnet.py
@@ -17,6 +17,7 @@
 import unittest
 
 import numpy as np
+from dygraph_to_static_util import test_and_compare_with_new_ir
 from simnet_dygraph_model import BOW, HingeLoss
 
 import paddle
@@ -176,6 +177,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(unittest.TestCase):
+    @test_and_compare_with_new_ir(True)
     def test_dygraph_static_same_loss(self):
         if fluid.is_compiled_with_cuda():
             fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py
index a49cc23af11f8..316464ab79132 100644
--- a/test/dygraph_to_static/test_simnet_v2.py
+++ b/test/dygraph_to_static/test_simnet_v2.py
@@ -17,6 +17,7 @@
 import unittest
 
 import numpy as np
+from dygraph_to_static_util import test_and_compare_with_new_ir
 from simnet_dygraph_model_v2 import BOW, HingeLoss
 
 import paddle
@@ -176,6 +177,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(unittest.TestCase):
+    @test_and_compare_with_new_ir(True)
     def test_dygraph_static_same_loss(self):
         if paddle.is_compiled_with_cuda():
             paddle.fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 05f30dca257f1..46a0136167e9e 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -592,6 +592,10 @@ py_test_modules(
 py_test_modules(test_install_check MODULES test_install_check ENVS
                 FLAGS_cudnn_deterministic=1)
 set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
+py_test_modules(test_install_check_new_ir MODULES test_install_check ENVS
+                FLAGS_cudnn_deterministic=1 FLAGS_enable_new_ir_in_executor=1)
+set_tests_properties(test_install_check_new_ir PROPERTIES LABELS
+                                                          "RUN_TYPE=DIST")
 
 if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6))
   py_test_modules(test_fused_gemm_epilogue_op MODULES
diff --git a/test/legacy_test/test_poisson_nll_loss.py b/test/legacy_test/test_poisson_nll_loss.py
index 096018a6e2bf0..14ad375519914 100644
--- a/test/legacy_test/test_poisson_nll_loss.py
+++ b/test/legacy_test/test_poisson_nll_loss.py
@@ -51,7 +51,9 @@ def ref_poisson_nll_loss(
         stirling_approx = (
             label * np.log(label) - label + 0.5 * np.log(2 * np.pi * label)
         )
-        loss_out += np.where(stirling_approx <= 1, 0, stirling_approx)
+        loss_out += np.where(
+            label > 1, stirling_approx, np.zeros_like(stirling_approx)
+        )
 
     if reduction == 'none':
         return loss_out
diff --git a/test/prim/new_ir_prim/test_vjp_prim.py b/test/prim/new_ir_prim/test_vjp_prim.py
index 2a29ae9f69fc2..22309a08823ec 100644
--- a/test/prim/new_ir_prim/test_vjp_prim.py
+++ b/test/prim/new_ir_prim/test_vjp_prim.py
@@ -63,6 +63,7 @@ class TestVjpPrim(unittest.TestCase):
     def test_divide_grad_prim_case1(self):
         newir_program = get_ir_divide_program()
         paddle.framework.core._set_prim_backward_enabled(True)
+        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True})
         dout = newir_program.block().ops[-2].result(0)
         out_grads = [[dout]]
         stop_gradients = [[False], [False]]
@@ -83,9 +84,9 @@ def test_divide_grad_prim_case1(self):
             "pd.full",
             "pd.elementwise_pow",
             "pd.divide",
-            "pd.multiply",
             "pd.full",
             "pd.scale",
+            "pd.multiply",
             "pd.full_int_array",
             "pd.sum",
             "pd.full_int_array",
@@ -101,6 +102,7 @@ def test_divide_grad_prim_case1(self):
         for idx, op in enumerate(newir_program.block().ops):
             self.assertEqual(op.name(), all_op_names[idx])
         paddle.framework.core._set_prim_backward_enabled(False)
+        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
 
     def test_divide_grad_no_prim(self):
         newir_program = get_ir_divide_program()
@@ -123,6 +125,7 @@ def test_divide_grad_no_prim(self):
     def test_sum_grad_prim(self):
         newir_program = get_ir_sum_program()
         paddle.framework.core._set_prim_backward_enabled(True)
+        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True})
         dout = newir_program.block().ops[-3].result(0)
         out_grads = [[dout]]
         stop_gradients = [[False], [True]]
@@ -147,6 +150,7 @@ def test_sum_grad_prim(self):
         for idx, op in enumerate(newir_program.block().ops):
             self.assertEqual(op.name(), all_op_names[idx])
         paddle.framework.core._set_prim_backward_enabled(False)
+        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
 
     def test_sum_grad_no_prim(self):
         newir_program = get_ir_sum_program()