From 064c16074b11e815f371b9cb0d28ea7dff1336e1 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Wed, 29 Nov 2023 09:35:45 +0800
Subject: [PATCH 01/14] [PIR+CINN]Support SubGraph Exporter for Unittest
 Platform (#59353)

* [PIR+CINN]Support SubGraph Exporter for Unittest Platform

* add unittest

* fix UT not take effect
---
 paddle/phi/core/flags.cc                      |  12 +
 .../paddle/jit/dy2static/export_subgraph.py   | 245 ++++++++++++++++++
 .../paddle/jit/dy2static/partial_program.py   |  33 ++-
 .../jit/dy2static/pir_partial_program.py      |   1 +
 .../jit/dy2static/program_translator.py       |  14 +-
 test/ir/pir/CMakeLists.txt                    |   4 +
 test/ir/pir/test_subgraph_exporter.py         | 102 ++++++++
 7 files changed, 406 insertions(+), 5 deletions(-)
 create mode 100644 python/paddle/jit/dy2static/export_subgraph.py
 create mode 100644 test/ir/pir/test_subgraph_exporter.py

diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index ebef7410c31bf..eea908c0db520 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -1337,6 +1337,18 @@ PHI_DEFINE_EXPORTED_string(
     ir_inplace_kernel_blacklist,
     "",
     "It controls the ir inplace kernel subset do not use.");
+/**
+ * Specify the directory for saving PIR subgraph from @to_static
+ * Name: pir_subgraph_saving_dir
+ * Since Version: 2.6.0
+ * Value Range: str, default=""
+ * Example:
+ * Note: "/workspace/my_path", it will save into my_path dir;
+ */
+PHI_DEFINE_EXPORTED_string(
+    pir_subgraph_saving_dir,
+    "",
+    "Specify the directory for saving PIR subgraph from @to_static.");
 
 PHI_DEFINE_EXPORTED_bool(enable_record_memory, false, "Enable memory recorder");
 
diff --git a/python/paddle/jit/dy2static/export_subgraph.py b/python/paddle/jit/dy2static/export_subgraph.py
new file mode 100644
index 0000000000000..900f29db1c2a4
--- /dev/null
+++ b/python/paddle/jit/dy2static/export_subgraph.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddle import pir
+from paddle.base import core
+from paddle.base.dygraph.base import switch_to_static_graph
+from paddle.base.framework import Variable, get_flags
+
+__all__ = []
+
+MAX_FILE_PATH_LEN = 50
+
+
+class SubGraphRole:
+    Infer = 0
+    Forward = 1
+    Backward = 2
+
+
+def get_saving_dir():
+    # Returns the raw value of the FLAG that names the export root directory.
+    flag_name = "FLAGS_pir_subgraph_saving_dir"
+    return get_flags(flag_name)[flag_name]
+
+
+class BaseExporter:
+    def __init__(self, partial_program_layer, program, role):
+        self.pp_layer = partial_program_layer
+        self.program = program
+        self.role = role
+        self.root_dir = get_saving_dir()
+
+    def save(self):
+        # step 1: Create subgraph saving path.
+        saving_path = self.generate_saving_path()
+
+        # step 2: Translate into pir program.
+        pir_program = self.translate_into_pir()
+
+        # step 3: save into local disk.
+        self._save(pir_program, saving_path)
+
+    def _save(self, pir_program, path):
+        # Pin the encoding so the dump does not depend on the host locale.
+        with open(path, 'w', encoding='utf-8') as f:
+            f.write(str(pir_program))
+
+    def parse_inout(self):
+        raise NotImplementedError("Need to implement parse_inout method")
+
+    def translate_into_pir(self):
+        # step 1: Insert data op for inputs/params
+        feed_list, fetch_list, inter_outs = self.parse_inout()
+        self.insert_feed_op(feed_list, "pt_input_")
+        # step 2: Insert fetch op for outputs
+        self.insert_fetch_op(fetch_list, "pt_output_")
+        self.insert_fetch_op(inter_outs, "pt_intermediate_")
+        # step 3: translate into pir
+        pir_program = pir.translate_to_pir(self.program.desc)
+        return pir_program
+
+    def generate_saving_path(self):
+        # Layout: <root_dir>/<layer_name>/<role prefix><joined op types>.txt
+        layer_name = self.pp_layer._debug_name
+        assert layer_name is not None
+        ops = self.program.block(0).ops[:MAX_FILE_PATH_LEN]
+        stem = "_".join(op.type for op in ops)[:MAX_FILE_PATH_LEN]
+        prefix = ["infer_", "forward_", "backward_"][self.role]
+        file_name = prefix + stem + '.txt'
+        saving_dir = os.path.join(self.root_dir, layer_name)
+        self.verify_saving_dir(saving_dir)
+        return os.path.join(saving_dir, file_name)
+
+    def verify_saving_dir(self, dir_path):
+        # exist_ok=True: no exists()/makedirs() race when exporters overlap.
+        os.makedirs(dir_path, exist_ok=True)
+
+    def insert_feed_op(self, intputs, rename_prefix):
+        global_block = self.program.block(0)
+
+        for i, var in enumerate(intputs):
+            old_name = var.name
+            new_name = rename_prefix + str(i)
+            global_block._rename_var(old_name, new_name)
+            out = global_block.var(new_name)
+            global_block._prepend_op(
+                type='data',
+                inputs={},
+                outputs={'out': out},
+                attrs={
+                    'shape': out.shape,
+                    'dtype': out.dtype,
+                    'place': 0,
+                    'name': out.name,
+                },
+            )
+        global_block._sync_with_cpp()
+
+    def insert_fetch_op(self, outputs, rename_prefix):
+        global_block = self.program.block(0)
+        fetch_var = global_block.create_var(
+            name="fetch_outputs",
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=False,
+        )
+        for i, out in enumerate(outputs):
+            var = self.get_var(out)
+            old_name = var.name
+            new_name = rename_prefix + str(i)
+            global_block._rename_var(old_name, new_name)
+            new_var = global_block.var(new_name)
+            global_block.append_op(
+                type="fetch",
+                inputs={'X': [new_var]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i},
+            )
+        global_block._sync_with_cpp()
+
+    def rename_ops(self, ops, new_name, old_name):
+        for op in ops:
+            op._rename_input(old_name, new_name)
+            op._rename_output(old_name, new_name)
+
+    def get_var(self, name_or_var):
+        if isinstance(name_or_var, Variable):
+            return name_or_var
+        assert isinstance(name_or_var, str)
+        global_block = self.program.block(0)
+        return global_block.var(name_or_var)
+
+
+class InferExporter(BaseExporter):
+    """Exporter used for the SubGraphRole.Infer program."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def parse_inout(self):
+        """Return (feeds, fetches, []) resolved by name in block 0."""
+        block = self.program.block(0)
+        layer = self.pp_layer
+        # Feeds are the layer inputs followed by its parameters; this order
+        # fixes the index in the names insert_feed_op assigns.
+        feed_vars = layer._inputs.tolist() + layer._params
+        fetch_vars = layer._outputs.tolist()
+        feeds = [block.var(v.name) for v in feed_vars]
+        fetches = [block.var(v.name) for v in fetch_vars]
+        # No intermediate outputs are reported for this role.
+        return feeds, fetches, []
+
+
+class TrainFwdExporter(BaseExporter):
+    """Exporter used for the SubGraphRole.Forward program."""
+
+    def __init__(self, pp_layer, copy_program, role, raw_inter_outs):
+        super().__init__(pp_layer, copy_program, role)
+        self.raw_inter_outs = raw_inter_outs
+
+    def parse_inout(self):
+        """Return (feeds, fetches, intermediates) for the forward program.
+
+        Intermediates are the names in ``raw_inter_outs`` that exist in
+        block 0 and are neither a feed nor a fetch.
+        """
+        block = self.program.block(0)
+        layer = self.pp_layer
+        feed_vars = layer._inputs.tolist() + layer._params
+        fetch_vars = layer._outputs.tolist()
+        pending = {n for n in self.raw_inter_outs if block.has_var(n)}
+
+        feeds, fetches = [], []
+        for var in feed_vars:
+            feeds.append(block.var(var.name))
+            pending.discard(var.name)
+        for var in fetch_vars:
+            fetches.append(block.var(var.name))
+            pending.discard(var.name)
+
+        # Whatever is left was neither fed nor fetched above.
+        return feeds, fetches, list(pending)
+
+
+class TrainBwdExporter(BaseExporter):
+    """Exporter used for the SubGraphRole.Backward program."""
+
+    def __init__(self, pp_layer, copy_program, role, raw_inputs, raw_outputs):
+        super().__init__(pp_layer, copy_program, role)
+        self.raw_inputs = raw_inputs
+        self.raw_outputs = raw_outputs
+
+    def parse_inout(self):
+        """Return (feeds, fetches, []) for the backward program."""
+        block = self.program.block(0)
+        known = self.raw_inputs
+
+        # Feed every raw input that exists in the block.
+        feeds = [block.var(n) for n in known if block.has_var(n)]
+        # Also feed each layer output's "@GRAD" var when the block has it
+        # and it is not already listed in raw_inputs.
+        for out_var in self.pp_layer._outputs.tolist():
+            grad_name = out_var.name + "@GRAD"
+            if grad_name not in known and block.has_var(grad_name):
+                feeds.append(block.var(grad_name))
+
+        # Names that are also raw inputs are never fetched.
+        fetches = [
+            block.var(n)
+            for n in self.raw_outputs
+            if block.has_var(n) and n not in known
+        ]
+
+        return feeds, fetches, []
+
+
+@switch_to_static_graph
+def pir_exporter(pp_layer, program, role, shared_inputs=None, inter_outs=None):
+    # Exporting is opt-in: do nothing unless the saving-dir FLAG is set.
+    if not get_saving_dir():
+        return
+    cloned = program.clone()
+    if role == SubGraphRole.Infer:
+        exporter = InferExporter(pp_layer, cloned, role)
+    elif role == SubGraphRole.Forward:
+        exporter = TrainFwdExporter(pp_layer, cloned, role, inter_outs)
+    elif role == SubGraphRole.Backward:
+        exporter = TrainBwdExporter(
+            pp_layer, cloned, role, shared_inputs, inter_outs
+        )
+    else:
+        raise RuntimeError("role only support Infer/Forward/Backward")
+    exporter.save()
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index 0aa0d0e3dcb07..1c4e2eb3b49f0 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -28,6 +28,7 @@
 from paddle.optimizer.lr import LRScheduler
 
 from . import logging_utils
+from .export_subgraph import SubGraphRole, pir_exporter
 from .utils import (
     RETURN_NO_VALUE_MAGIC_NUM,
     backend_guard,
@@ -226,6 +227,7 @@ def __init__(
         self._out_var_descs = [
             self._outputs[var_id].desc for var_id in self._outputs.var_ids
         ]
+        self._debug_name = None
 
     def __call__(self, inputs):
         """
@@ -544,14 +546,19 @@ def train_program(self):
     @property
     def infer_program(self):
         if _in_amp_guard():
-            return self._infer_amp_program
+            infer_program = self._infer_amp_program
         elif _in_pure_fp16_guard():
-            return self._infer_pure_fp16_program
+            infer_program = self._infer_pure_fp16_program
         else:
-            return self._infer_program
+            infer_program = self._infer_program
+        # NOTE(Aurelius84): Export infer_program for SubGraphChecker,
+        # see export_subgraph for detail.
+        pir_exporter(self, infer_program, SubGraphRole.Infer)
+        return infer_program
 
     @property
     def forward_program(self):
+        forward_program, role = None, None
         if self.training:
             if _in_amp_guard():
                 progs = self._train_amp_forward_backward_program
@@ -561,7 +568,8 @@ def forward_program(self):
                 progs = self._train_forward_backward_program
             return progs[0]
         else:
-            return self.infer_program
+            forward_program = self.infer_program
+        return forward_program
 
     @property
     def backward_program(self):
@@ -868,6 +876,23 @@ def _get_forward_backward_program_form(
         self._apply_inplace_pass(
             forward_builded_program, backward_builded_program
         )
+
+        # NOTE(Aurelius84): Export forward/backward program for SubGraphChecker,
+        # see export_subgraph for detail.
+        pir_exporter(
+            self,
+            forward_builded_program,
+            SubGraphRole.Forward,
+            set(),
+            set(forward_skip_vars),
+        )
+        pir_exporter(
+            self,
+            backward_builded_program,
+            SubGraphRole.Backward,
+            set(forward_skip_vars),
+            set(backward_skip_vars),
+        )
         return [forward_builded_program, backward_builded_program]
 
     def _apply_inplace_pass(self, forward_program, backward_program):
diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py
index 109c96ca11bba..d8009a3bcd399 100644
--- a/python/paddle/jit/dy2static/pir_partial_program.py
+++ b/python/paddle/jit/dy2static/pir_partial_program.py
@@ -445,6 +445,7 @@ def __init__(
         self._hooker = None
         self._backend = kwargs.get('backend', None)
         self._grad_var_names = {}
+        self._debug_name = None
 
     def __call__(self, inputs):
         """
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index a8b87750b108a..26691bf06b0f0 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -376,6 +376,16 @@ def __init__(self, function, input_spec=None, **kwargs):
         self._cuda_graph_capture_mode = ""
         self._cuda_graph_pool_id = 0
         self._property = kwargs.get("property", False)
+        self._get_debug_name()
+
+    def _get_debug_name(self):
+        try:
+            if self._class_instance:
+                self._debug_name = self._class_instance.__class__.__name__
+            else:
+                self._debug_name = self._dygraph_function.__name__
+        except Exception:
+            self._debug_name = "static_function"
 
     @property
     def is_property(self):
@@ -778,7 +788,7 @@ def _perform_call(self, *args, **kwargs):
         args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
 
         try:
-            concrete_program, partial_program_layer = self.get_concrete_program(
+            _, partial_program_layer = self.get_concrete_program(
                 *args, **kwargs, is_train=self._is_train_mode()
             )
             # 2. synchronize self.training attribute.
@@ -863,6 +873,7 @@ def get_concrete_program(self, *args, **kwargs):
             concrete_program, partial_program_layer = self._program_cache[
                 cache_key
             ]
+        partial_program_layer._debug_name = self._debug_name
         return concrete_program, partial_program_layer
 
     def get_concrete_program_with_cache_key(self, cached_key):
@@ -1442,6 +1453,7 @@ class FallbackProgramLayer:
         'training',
         '_cuda_graph_capture_mode',
         '_cuda_graph_pool_id',
+        '_debug_name',
     ]
 
     def __init__(self, instance, dy_func):
diff --git a/test/ir/pir/CMakeLists.txt b/test/ir/pir/CMakeLists.txt
index 5a9f2c48509b3..dd2d0ced90104 100644
--- a/test/ir/pir/CMakeLists.txt
+++ b/test/ir/pir/CMakeLists.txt
@@ -14,6 +14,10 @@ set(TEST_IR_SYSTEM_CASES
     test_stop_gradient
     test_override_operator)
 list(REMOVE_ITEM TEST_INTERP_CASES ${TEST_IR_SYSTEM_CASES})
+list(REMOVE_ITEM TEST_INTERP_CASES test_subgraph_exporter)
+py_test_modules(
+  test_subgraph_exporter MODULES test_subgraph_exporter ENVS MIN_GRAPH_SIZE=0
+  FLAGS_pir_subgraph_saving_dir=${CMAKE_CURRENT_SOURCE_DIR})
 
 foreach(target ${TEST_INTERP_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
diff --git a/test/ir/pir/test_subgraph_exporter.py b/test/ir/pir/test_subgraph_exporter.py
new file mode 100644
index 0000000000000..9eed01700dbae
--- /dev/null
+++ b/test/ir/pir/test_subgraph_exporter.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import unittest
+
+import paddle
+from paddle.jit.dy2static.export_subgraph import get_saving_dir
+
+
+class Net(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        y = paddle.pow(x, 2)
+        z = x + y
+        z2 = paddle.matmul(y, z)
+        out = paddle.nn.functional.relu(z * z2)
+        out = paddle.mean(out)
+        return out, z2
+
+
+class TestSaveFwdBwdProg(unittest.TestCase):
+    def setUp(self):
+        self.net = paddle.jit.to_static(Net())
+        self.root_dir = os.path.join(get_saving_dir(), "wrapper")
+        self.clean()
+
+    def clean(self):
+        if os.path.exists(self.root_dir):
+            shutil.rmtree(self.root_dir)
+        os.mkdir(self.root_dir)
+
+    def test_export(self):
+        x = paddle.randn([4, 4])
+        x.stop_gradient = False
+        out = self.net(x)
+        self.check_export()
+
+    def check_export(self):
+        for prog_file in os.listdir(self.root_dir):
+            if "forward" in prog_file:
+                self.check_fwd(prog_file)
+                return
+            elif "backward" in prog_file:
+                self.check_bwd(prog_file)
+            else:
+                raise RuntimeError("Not Support.")
+
+    def check_fwd(self, prog_file):
+        prog_info = [
+            "pt_input_0",
+            "pt_output_0",
+            "pt_output_1",
+            "pt_intermediate_0",
+            "pt_intermediate_1",
+            "pt_intermediate_2",
+        ]
+        path = os.path.join(self.root_dir, prog_file)
+        with open(path, 'r') as f:
+            content = f.readlines()
+        index = 0
+        for op_str in content:
+            if "pd_op.data" in op_str or "pd_op.fetch" in op_str:
+                self.assertIn(prog_info[index], op_str)
+                index += 1
+
+    def check_bwd(self, prog_file):
+        prog_info = [
+            "pt_input_6",
+            "pt_input_5",
+            "pt_input_4",
+            "pt_input_3",
+            "pt_input_2",
+            "pt_input_1",
+            "pt_input_0",
+        ]
+        path = os.path.join(self.root_dir, prog_file)
+        with open(path, 'r') as f:
+            content = f.readlines()
+        index = 0
+        for op_str in content:
+            if "pd_op.data" in op_str or "pd_op.fetch" in op_str:
+                self.assertIn(prog_info[index], op_str)
+                index += 1
+
+
+if __name__ == "__main__":
+    unittest.main()

From b22b8798ec6fc0263294bd9b1f12831b2a508619 Mon Sep 17 00:00:00 2001
From: houj04 <35131887+houj04@users.noreply.github.com>
Date: Wed, 29 Nov 2023 09:51:49 +0800
Subject: [PATCH 02/14] [XPU] prepare ut for bf16 (#59389)

* [XPU] prepare ut for bf16

* add ut for coverage

* add ut for coverage
---
 paddle/phi/backends/xpu/xpu2_op_list.cc      |  5 ++-
 paddle/phi/backends/xpu/xpu3_op_list.cc      |  5 ++-
 paddle/phi/kernels/funcs/tensor_formatter.cc |  8 ++++
 test/legacy_test/test_print_op.py            | 39 ++++++++++++++++++--
 test/xpu/op_test_xpu.py                      |  4 +-
 test/xpu/test_adamw_op_xpu.py                |  2 +-
 6 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 323207df20371..efb5b4c9d098f 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -595,7 +595,10 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mean_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
-      {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"mean",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::BFLOAT16})},
       {"merged_adam", XPUKernelSet({phi::DataType::FLOAT32})},
       {"merged_momentum",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
index aeb51998f4d7a..29e9c7e0f8901 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -571,7 +571,10 @@ XPUOpMap& get_kl3_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mean_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
-      {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"mean",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::BFLOAT16})},
       {"merged_momentum",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mish_grad", XPUKernelSet({phi::DataType::FLOAT32})},
diff --git a/paddle/phi/kernels/funcs/tensor_formatter.cc b/paddle/phi/kernels/funcs/tensor_formatter.cc
index 16d3b38bced7c..7c4cd28fe20c7 100644
--- a/paddle/phi/kernels/funcs/tensor_formatter.cc
+++ b/paddle/phi/kernels/funcs/tensor_formatter.cc
@@ -107,6 +107,10 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor,
     FormatData<int64_t>(print_tensor, log_stream);
   } else if (dtype == phi::DataType::BOOL) {
     FormatData<bool>(print_tensor, log_stream);
+  } else if (dtype == phi::DataType::FLOAT16) {
+    FormatData<phi::dtype::float16>(print_tensor, log_stream);
+  } else if (dtype == phi::DataType::BFLOAT16) {
+    FormatData<phi::dtype::bfloat16>(print_tensor, log_stream);
   } else {
     log_stream << "  - data: unprintable type: " << dtype << std::endl;
   }
@@ -153,6 +157,10 @@ template void TensorFormatter::FormatData<int>(
     const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
 template void TensorFormatter::FormatData<int64_t>(
     const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
+template void TensorFormatter::FormatData<phi::dtype::float16>(
+    const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
+template void TensorFormatter::FormatData<phi::dtype::bfloat16>(
+    const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
 
 }  // namespace funcs
 }  // namespace paddle
diff --git a/test/legacy_test/test_print_op.py b/test/legacy_test/test_print_op.py
index 95c1dd420626d..1ce1a08643210 100755
--- a/test/legacy_test/test_print_op.py
+++ b/test/legacy_test/test_print_op.py
@@ -15,6 +15,7 @@
 import unittest
 
 import numpy as np
+from op_test import convert_float_to_uint16
 from simple_nets import init_data, simple_fc_net
 
 import paddle
@@ -30,14 +31,17 @@
 
 class TestPrintOpCPU(unittest.TestCase):
     def setUp(self):
+        self.dtype = 'float32'
         self.place = paddle.CPUPlace()
         self.x_tensor = base.core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
         self.x_tensor.set(tensor_np, self.place)
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
     def build_network(self, only_forward, **kargs):
-        x = paddle.static.data('x', shape=[-1, 3], dtype='float32', lod_level=1)
+        x = paddle.static.data(
+            'x', shape=[-1, 3], dtype=self.dtype, lod_level=1
+        )
         x.stop_gradient = False
         paddle.static.Print(input=x, **kargs)
         loss = paddle.mean(x)
@@ -77,7 +81,7 @@ def test_all_parameters(self):
         prog = paddle.static.Program()
         with paddle.static.program_guard(prog, paddle.static.Program()):
             x = paddle.static.data(
-                'x', shape=[-1, 3], dtype='float32', lod_level=1
+                'x', shape=[-1, 3], dtype=self.dtype, lod_level=1
             )
             x.stop_gradient = False
 
@@ -136,9 +140,36 @@ def test_errors(self):
 )
 class TestPrintOpGPU(TestPrintOpCPU):
     def setUp(self):
+        self.dtype = 'float32'
         self.place = paddle.CUDAPlace(0)
         self.x_tensor = base.core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestPrintOpGPUFP16(TestPrintOpCPU):
+    def setUp(self):
+        self.dtype = 'float16'
+        self.place = paddle.CUDAPlace(0)
+        self.x_tensor = base.core.LoDTensor()
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestPrintOpGPUBFP16(TestPrintOpCPU):
+    def setUp(self):
+        self.dtype = 'bfloat16'
+        self.place = paddle.CUDAPlace(0)
+        self.x_tensor = base.core.LoDTensor()
+        tensor_np = convert_float_to_uint16(np.random.random(size=(2, 3)))
         self.x_tensor.set(tensor_np, self.place)
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py
index 7ea5359de5044..09ee428714bd6 100644
--- a/test/xpu/op_test_xpu.py
+++ b/test/xpu/op_test_xpu.py
@@ -183,8 +183,8 @@ def check_grad_with_place(
             if not core.is_float16_supported(place):
                 return
 
-        if self.dtype == np.float16:
-            max_relative_error = 1.0
+        if self.dtype == np.float16 or self.dtype == np.uint16:
+            max_relative_error = 0.1
             return super().check_grad_with_place(
                 place,
                 inputs_to_check,
diff --git a/test/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py
index 1a777f2d23578..8584360837d79 100644
--- a/test/xpu/test_adamw_op_xpu.py
+++ b/test/xpu/test_adamw_op_xpu.py
@@ -84,7 +84,7 @@ def setUp(self):
             # Test AdamW Op with supplied attributes
             self.op_type = "adamw"
             self.init_shape()
-            self.dtype = self.in_type_str
+            self.dtype = self.in_type
             param = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
             grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
             moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")

From 141ef977773231d6b424dd855c77f7677865680e Mon Sep 17 00:00:00 2001
From: yuguo <948529990@qq.com>
Date: Wed, 29 Nov 2023 10:05:54 +0800
Subject: [PATCH 03/14] fix rnn op bug for DCU (#59402)

---
 paddle/phi/kernels/gpu/rnn_functor.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h
index bdbcd05e65772..ae43d64ea2d8c 100644
--- a/paddle/phi/kernels/gpu/rnn_functor.h
+++ b/paddle/phi/kernels/gpu/rnn_functor.h
@@ -99,14 +99,24 @@ class RNNDescriptors {
     // ------------------- cudnn dropout descriptors ---------------------
     size_t state_size;
     bool is_initialized = dropout_state->initialized();
-    if (!is_test_ && !is_initialized) {
 #ifdef PADDLE_WITH_HIP
+    if (!is_initialized) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::miopenDropoutGetStatesSize(handle, &state_size));
+      dropout_state->Resize({static_cast<int64_t>(state_size)});
+      dev_ctx.template Alloc<uint8_t>(dropout_state);
+    }
+    dropout_desc_.descriptor(handle,  // NOLINT
+                             dev_ctx.GetPlace(),
+                             is_initialized,
+                             dropout_prob_,
+                             dropout_state,
+                             seed_,
+                             state_size);
 #else
+    if (!is_test_ && !is_initialized) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-#endif
       dropout_state->Resize({static_cast<int64_t>(state_size)});
       dev_ctx.template Alloc<uint8_t>(dropout_state);
     }
@@ -117,6 +127,7 @@ class RNNDescriptors {
                              is_test_ ? nullptr : dropout_state,
                              seed_,
                              state_size);
+#endif
 
 // ------------------- cudnn rnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP

From 3c637e10cf4ac3aca5a15f90c87c509c3a0f9eb5 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:24:09 +0800
Subject: [PATCH 04/14] =?UTF-8?q?=E3=80=90pir=E3=80=91modify=20fused=5Flin?=
 =?UTF-8?q?ear=5Fparamgrad=5Fadd=5Fpass=20(#59296)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify pass

* modify pattern insert point wrong

* modify pattern insert point wrong

* fix bug DeleteSourcePatternOp in DRR

* refine code

* modify pattern insert point wrong

* delete matmul_add replace

* delete matmul_add replace

---------

Co-authored-by: zyfncg <zhangyunfei07@baidu.com>
---
 paddle/fluid/pir/drr/drr_rewrite_pattern.cc   |  2 +-
 .../fused_linear_param_grad_add_pass.cc       | 51 ++++++++++++++-----
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/pir/drr/drr_rewrite_pattern.cc b/paddle/fluid/pir/drr/drr_rewrite_pattern.cc
index d558a2c10746b..91be95e788805 100644
--- a/paddle/fluid/pir/drr/drr_rewrite_pattern.cc
+++ b/paddle/fluid/pir/drr/drr_rewrite_pattern.cc
@@ -401,7 +401,7 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
           max_index_op = ir_input_op;
         } else if (max_input_op_index == op_2_temp_program_index[ir_input_op]) {
           const auto& ops_vec = temp_program[max_input_op_index];
-          for (auto it = ops_vec.rbegin(); it != ops_vec.rend(); it++) {
+          for (auto it = ops_vec.begin(); it != ops_vec.end(); it++) {
             if (*it == max_index_op) {
               break;
             } else if (*it == ir_input_op) {
diff --git a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc
index bc0821d832c04..7a3afec65f33f 100644
--- a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc
+++ b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc
@@ -26,15 +26,22 @@ class FusedMatmulAddGradAddPattern
  public:
   void operator()(pir::drr::DrrPatternContext *ctx) const override {
     pir::drr::SourcePattern pat = ctx->SourcePattern();
+    const auto &matmul0 = pat.Op(paddle::dialect::MatmulOp::name(),
+                                 {{"transpose_x", pat.Attr("trans_x")},
+                                  {"transpose_y", pat.Attr("trans_y")}});
+    const auto &add0 = pat.Op(paddle::dialect::AddOp::name());
     const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name());
     const auto &matmul_grad = pat.Op(paddle::dialect::MatmulGradOp::name(),
                                      {{"transpose_x", pat.Attr("trans_x")},
                                       {"transpose_y", pat.Attr("trans_y")}});
     const auto &add_ = pat.Op(paddle::dialect::Add_Op::name());
 
-    add_grad(
-        {&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("addout_grad")},
-        {&pat.Tensor("out_grad"), &pat.Tensor("dbias")});
+    pat.Tensor("out") = matmul0(pat.Tensor("x"), pat.Tensor("weight"));
+    pat.Tensor("fwd_add_out") = add0(pat.Tensor("out"), pat.Tensor("bias"));
+    add_grad({&pat.Tensor("out"),
+              &pat.Tensor("bias"),
+              &pat.Tensor("fwd_add_out_grad")},
+             {&pat.Tensor("out_grad"), &pat.Tensor("dbias")});
     matmul_grad(
         {&pat.Tensor("x"), &pat.Tensor("weight"), &pat.Tensor("out_grad")},
         {&pat.Tensor("x_grad"), &pat.Tensor("weight_grad")});
@@ -47,7 +54,7 @@ class FusedMatmulAddGradAddPattern
       return (match_ctx.Tensor("weight_grad").Shape() ==
                   match_ctx.Tensor("dweight").Shape() &&
               match_ctx.Tensor("out").Shape() ==
-                  match_ctx.Tensor("addout_grad").Shape() &&
+                  match_ctx.Tensor("fwd_add_out_grad").Shape() &&
               x_trans == false && y_trans == false);
     });
 
@@ -70,10 +77,10 @@ class FusedMatmulAddGradAddPattern
         paddle::dialect::FusedLinearParamGradAddOp::name(),
         {{{"multi_precision", muti_precision_attr}, {"has_bias", true_attr}}});
 
-    matmul({&res.Tensor("addout_grad"), &res.Tensor("weight")},
+    matmul({&res.Tensor("fwd_add_out_grad"), &res.Tensor("weight")},
            {&res.Tensor("x_grad")});
     fused_linear_param_grad_add({&res.Tensor("x"),
-                                 &res.Tensor("addout_grad"),
+                                 &res.Tensor("fwd_add_out_grad"),
                                  &res.Tensor("dweight"),
                                  &res.NoneTensor()},
                                 {&res.Tensor("add_out"), &res.Tensor("dbias")});
@@ -232,15 +239,25 @@ class FusedMatmulAddGradAddaPattern
  public:
   void operator()(pir::drr::DrrPatternContext *ctx) const override {
     pir::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name());
     const auto &matmul = pat.Op(paddle::dialect::MatmulOp::name(),
                                 {{"transpose_x", pat.Attr("trans_x")},
                                  {"transpose_y", pat.Attr("trans_y")}});
+    const auto &add = pat.Op(paddle::dialect::AddOp::name());
+    const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name());
+    const auto &matmul_g0 = pat.Op(paddle::dialect::MatmulOp::name(),
+                                   {{"transpose_x", pat.Attr("trans_xg0")},
+                                    {"transpose_y", pat.Attr("trans_yg0")}});
+    const auto &matmul_g1 = pat.Op(paddle::dialect::MatmulOp::name(),
+                                   {{"transpose_x", pat.Attr("trans_xg1")},
+                                    {"transpose_y", pat.Attr("trans_yg1")}});
     const auto &add_ = pat.Op(paddle::dialect::Add_Op::name());
+
+    pat.Tensor("out") = matmul(pat.Tensor("x"), pat.Tensor("weight"));
+    pat.Tensor("fwd_add_out") = add(pat.Tensor("out"), pat.Tensor("bias"));
     add_grad({&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("dadd_out")},
              {&pat.Tensor("dout"), &pat.Tensor("dbias")});
-    matmul({&pat.Tensor("x"), &pat.Tensor("dout")},
-           {&pat.Tensor("weight_grad")});
+    pat.Tensor("dx") = matmul_g0(pat.Tensor("dout"), pat.Tensor("weight"));
+    pat.Tensor("weight_grad") = matmul_g1(pat.Tensor("x"), pat.Tensor("dout"));
     pat.Tensor("dweight_out") =
         add_(pat.Tensor("dweight"), pat.Tensor("weight_grad"));
 
@@ -277,15 +294,25 @@ class FusedMatmulAddGradAddbPattern
  public:
   void operator()(pir::drr::DrrPatternContext *ctx) const override {
     pir::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name());
     const auto &matmul = pat.Op(paddle::dialect::MatmulOp::name(),
                                 {{"transpose_x", pat.Attr("trans_x")},
                                  {"transpose_y", pat.Attr("trans_y")}});
+    const auto &add = pat.Op(paddle::dialect::AddOp::name());
+    const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name());
+    const auto &matmul_g0 = pat.Op(paddle::dialect::MatmulOp::name(),
+                                   {{"transpose_x", pat.Attr("trans_xg0")},
+                                    {"transpose_y", pat.Attr("trans_yg0")}});
+    const auto &matmul_g1 = pat.Op(paddle::dialect::MatmulOp::name(),
+                                   {{"transpose_x", pat.Attr("trans_xg1")},
+                                    {"transpose_y", pat.Attr("trans_yg1")}});
     const auto &add_ = pat.Op(paddle::dialect::Add_Op::name());
+
+    pat.Tensor("out") = matmul(pat.Tensor("x"), pat.Tensor("weight"));
+    pat.Tensor("fwd_add_out") = add(pat.Tensor("out"), pat.Tensor("bias"));
     add_grad({&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("dadd_out")},
              {&pat.Tensor("dout"), &pat.Tensor("dbias")});
-    matmul({&pat.Tensor("x"), &pat.Tensor("dout")},
-           {&pat.Tensor("weight_grad")});
+    pat.Tensor("dx") = matmul_g0(pat.Tensor("dout"), pat.Tensor("weight"));
+    pat.Tensor("weight_grad") = matmul_g1(pat.Tensor("x"), pat.Tensor("dout"));
     pat.Tensor("dweight_out") =
         add_(pat.Tensor("weight_grad"), pat.Tensor("dweight"));
 

From cae8de7856c8f5a632d49396f4949f5c2caba881 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 29 Nov 2023 10:25:12 +0800
Subject: [PATCH 05/14] [Fix] Fix DistributedBatchSampler for len(dataset) <
 num_replicas (#59390)

* fix for case when len(dataset) < num_replicas

* rewrite code
---
 python/paddle/io/dataloader/batch_sampler.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py
index 20a9bb9a00da4..aa2f4547fd477 100644
--- a/python/paddle/io/dataloader/batch_sampler.py
+++ b/python/paddle/io/dataloader/batch_sampler.py
@@ -271,7 +271,15 @@ def __init__(
     def __iter__(self):
         num_samples = len(self.dataset)
         indices = np.arange(num_samples).tolist()
-        indices += indices[: (self.total_size - len(indices))]
+        # add extra samples to make it evenly divisible
+        padding_size = self.total_size - len(indices)
+        if padding_size <= len(indices):
+            indices += indices[:padding_size]
+        else:
+            indices += (indices * math.ceil(padding_size / len(indices)))[
+                :padding_size
+            ]
+
         assert len(indices) == self.total_size
         if self.shuffle:
             np.random.RandomState(self.epoch).shuffle(indices)

From 1d859c503d3e3d4fd51cd1245d7fbe9c4cca7034 Mon Sep 17 00:00:00 2001
From: gouzil <66515297+gouzil@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:37:28 +0800
Subject: [PATCH 06/14] [Dy2St] pir dy2st unittest verification - Part 11
 (#59314)

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
---
 test/dygraph_to_static/test_build_strategy.py |   2 +
 .../test_load_transformer.py                  |   7 +-
 test/dygraph_to_static/test_save_load.py      |  75 ++++++-----
 test/dygraph_to_static/test_sentiment.py      | 122 +++++++++---------
 test/dygraph_to_static/test_simnet.py         |   5 +-
 test/dygraph_to_static/test_simnet_v2.py      |   3 +-
 6 files changed, 112 insertions(+), 102 deletions(-)

diff --git a/test/dygraph_to_static/test_build_strategy.py b/test/dygraph_to_static/test_build_strategy.py
index 991b0d52c7698..84b31adc296db 100644
--- a/test/dygraph_to_static/test_build_strategy.py
+++ b/test/dygraph_to_static/test_build_strategy.py
@@ -18,6 +18,7 @@
 from dygraph_to_static_utils import (
     Dy2StTestBase,
     test_ast_only,
+    test_legacy_and_pt_and_pir,
 )
 from test_resnet import ResNetHelper
 
@@ -87,6 +88,7 @@ def test_in_static_mode_mkldnn(self):
 
 
 class TestError(Dy2StTestBase):
+    @test_legacy_and_pt_and_pir
     def test_type_error(self):
         def foo(x):
             out = x + 1
diff --git a/test/dygraph_to_static/test_load_transformer.py b/test/dygraph_to_static/test_load_transformer.py
index ccb583428f95d..6698ba7ef6075 100644
--- a/test/dygraph_to_static/test_load_transformer.py
+++ b/test/dygraph_to_static/test_load_transformer.py
@@ -16,7 +16,10 @@
 import unittest
 
 import numpy as np
-from dygraph_to_static_utils import Dy2StTestBase
+from dygraph_to_static_utils import (
+    Dy2StTestBase,
+    test_legacy_and_pt_and_pir,
+)
 
 import paddle
 
@@ -45,6 +48,7 @@ class TestFallback(Dy2StTestBase):
     def setUp(self):
         self.x = paddle.to_tensor(1.0).astype('int')
 
+    @test_legacy_and_pt_and_pir
     def test_name_load(self):
         net_dy = Net()
         net_st = Net()
@@ -54,6 +58,7 @@ def test_name_load(self):
 
 
 class TestLoad2(Dy2StTestBase):
+    @test_legacy_and_pt_and_pir
     def test_name_load_nograd(self):
         @paddle.no_grad()
         def func(x):
diff --git a/test/dygraph_to_static/test_save_load.py b/test/dygraph_to_static/test_save_load.py
index 755e9ff143e85..0246be752c841 100644
--- a/test/dygraph_to_static/test_save_load.py
+++ b/test/dygraph_to_static/test_save_load.py
@@ -72,40 +72,38 @@ def test_save_load_same_result(self):
         x_data = np.random.randn(30, 10, 32).astype('float32')
         batch_num = 3
 
-        with base.dygraph.guard(place):
-            paddle.jit.enable_to_static(True)
-            x = base.dygraph.to_variable(x_data)
-            net = Linear(32, 64)
-            adam = Adam(learning_rate=0.1, parameters=net.parameters())
-
-            for i in range(batch_num):
-                static_out, static_loss = net(x)
-                # Update parameters
-                static_loss.backward()
-                adam.minimize(static_loss)
-                net.clear_gradients()
-            # Save parameters
-
-            paddle.save(net.state_dict(), self.model_path + '.pdparams')
-            # minimize() will update parameter, call net() to get output and avg_loss.
-            # Switch into eval mode.
-            net.eval()
+        paddle.jit.enable_to_static(True)
+        x = base.dygraph.to_variable(x_data)
+        net = Linear(32, 64)
+        adam = Adam(learning_rate=0.1, parameters=net.parameters())
+
+        for i in range(batch_num):
             static_out, static_loss = net(x)
+            # Update parameters
+            static_loss.backward()
+            adam.minimize(static_loss)
+            net.clear_gradients()
+        # Save parameters
+
+        paddle.save(net.state_dict(), self.model_path + '.pdparams')
+        # minimize() will update parameter, call net() to get output and avg_loss.
+        # Switch into eval mode.
+        net.eval()
+        static_out, static_loss = net(x)
 
         # load parameters into dygraph
-        with base.dygraph.guard(place):
-            dygraph_net = Linear(32, 64)
+        dygraph_net = Linear(32, 64)
 
-            # Load parameters
-            model_dict = paddle.load(self.model_path + '.pdparams')
-            dygraph_net.set_dict(model_dict)
-            # Switch into eval mode.
-            dygraph_net.eval()
+        # Load parameters
+        model_dict = paddle.load(self.model_path + '.pdparams')
+        dygraph_net.set_dict(model_dict)
+        # Switch into eval mode.
+        dygraph_net.eval()
 
-            x = base.dygraph.to_variable(x_data)
-            # predict output
-            paddle.jit.enable_to_static(False)
-            dygraph_out, dygraph_loss = dygraph_net(x)
+        x = base.dygraph.to_variable(x_data)
+        # predict output
+        paddle.jit.enable_to_static(False)
+        dygraph_out, dygraph_loss = dygraph_net(x)
 
         np.testing.assert_allclose(
             dygraph_out.numpy(), static_out.numpy(), rtol=1e-05
@@ -114,6 +112,17 @@ def test_save_load_same_result(self):
             dygraph_loss.numpy(), static_loss.numpy(), rtol=1e-05
         )
 
+    def _compute_op_num(self, composite_program):
+        if paddle.framework.use_pir_api():
+            comp_op_type_list = [
+                op.name() for op in composite_program.program.global_block().ops
+            ]
+        else:
+            comp_op_type_list = [
+                op.type for op in composite_program.block(0).ops
+            ]
+        return comp_op_type_list
+
     @test_ast_only
     def test_save_load_prim(self):
         with base.dygraph.guard(place):
@@ -127,9 +136,7 @@ def test_save_load_prim(self):
             composite_program = static_net.forward.get_concrete_program(self.x)[
                 1
             ].train_program
-            comp_op_type_list = [
-                op.type for op in composite_program.block(0).ops
-            ]
+            comp_op_type_list = self._compute_op_num(composite_program)
             self.assertNotIn("batch_norm", comp_op_type_list)
             self.assertNotIn("relu", comp_op_type_list)
             self.assertNotIn("pow", comp_op_type_list)
@@ -169,9 +176,7 @@ def test_save_load_prim_with_hook(self):
             composite_program = static_net.forward.get_concrete_program(self.x)[
                 1
             ].train_program
-            comp_op_type_list = [
-                op.type for op in composite_program.block(0).ops
-            ]
+            comp_op_type_list = self._compute_op_num(composite_program)
             self.assertNotIn("batch_norm", comp_op_type_list)
             self.assertNotIn("relu", comp_op_type_list)
             self.assertNotIn("pow", comp_op_type_list)
diff --git a/test/dygraph_to_static/test_sentiment.py b/test/dygraph_to_static/test_sentiment.py
index bc606751f4624..5e93521aade88 100644
--- a/test/dygraph_to_static/test_sentiment.py
+++ b/test/dygraph_to_static/test_sentiment.py
@@ -21,7 +21,6 @@
 import paddle
 from paddle import base
 from paddle.base.dygraph import to_variable
-from paddle.jit.api import to_static
 from paddle.nn import Embedding, Linear
 
 SEED = 2020
@@ -88,7 +87,6 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self._fc1_act = paddle.nn.Softmax()
         self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim)
 
-    @to_static
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
         o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
@@ -132,7 +130,6 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self._fc2 = Linear(self.hid_dim, self.fc_hid_dim)
         self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim)
 
-    @to_static
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
         o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
@@ -171,7 +168,7 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.embedding = Embedding(
             self.dict_dim + 1,
             self.emb_dim,
-            weight_attr=base.ParamAttr(learning_rate=30),
+            weight_attr=paddle.ParamAttr(learning_rate=30),
             sparse=False,
         )
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
@@ -181,7 +178,6 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim)
         self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0)
 
-    @to_static
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
         o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
@@ -219,7 +215,7 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.embedding = Embedding(
             self.dict_dim + 1,
             self.emb_dim,
-            weight_attr=base.ParamAttr(learning_rate=30),
+            weight_attr=paddle.ParamAttr(learning_rate=30),
             sparse=False,
         )
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
@@ -234,7 +230,6 @@ def __init__(self, dict_dim, batch_size, seq_len):
             size=self.hid_dim, h_0=h_0, is_reverse=True
         )
 
-    @to_static
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
         o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
@@ -304,68 +299,71 @@ class Args:
 
 def train(args, to_static):
     paddle.jit.enable_to_static(to_static)
-    place = (
-        base.CUDAPlace(0) if base.is_compiled_with_cuda() else base.CPUPlace()
-    )
+    np.random.seed(SEED)
+    paddle.seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
 
-    with base.dygraph.guard(place):
-        np.random.seed(SEED)
-        paddle.seed(SEED)
-        paddle.framework.random._manual_program_seed(SEED)
+    train_reader = fake_data_reader(
+        args.class_num, args.vocab_size, args.batch_size, args.padding_size
+    )
+    train_loader = base.io.DataLoader.from_generator(capacity=24)
+    train_loader.set_sample_list_generator(train_reader)
 
-        train_reader = fake_data_reader(
-            args.class_num, args.vocab_size, args.batch_size, args.padding_size
+    if args.model_type == 'cnn_net':
+        model = paddle.jit.to_static(
+            CNN(args.vocab_size, args.batch_size, args.padding_size)
+        )
+    elif args.model_type == 'bow_net':
+        model = paddle.jit.to_static(
+            BOW(args.vocab_size, args.batch_size, args.padding_size)
         )
-        train_loader = base.io.DataLoader.from_generator(capacity=24)
-        train_loader.set_sample_list_generator(train_reader)
-
-        if args.model_type == 'cnn_net':
-            model = CNN(args.vocab_size, args.batch_size, args.padding_size)
-        elif args.model_type == 'bow_net':
-            model = BOW(args.vocab_size, args.batch_size, args.padding_size)
-        elif args.model_type == 'gru_net':
-            model = GRU(args.vocab_size, args.batch_size, args.padding_size)
-        elif args.model_type == 'bigru_net':
-            model = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
-        sgd_optimizer = paddle.optimizer.Adagrad(
-            learning_rate=args.lr, parameters=model.parameters()
+    elif args.model_type == 'gru_net':
+        model = paddle.jit.to_static(
+            GRU(args.vocab_size, args.batch_size, args.padding_size)
         )
+    elif args.model_type == 'bigru_net':
+        model = paddle.jit.to_static(
+            BiGRU(args.vocab_size, args.batch_size, args.padding_size)
+        )
+    sgd_optimizer = paddle.optimizer.Adagrad(
+        learning_rate=args.lr, parameters=model.parameters()
+    )
 
-        loss_data = []
-        for eop in range(args.epoch):
-            time_begin = time.time()
-            for batch_id, data in enumerate(train_loader()):
-                word_ids, labels, seq_lens = data
-                doc = to_variable(word_ids.numpy().reshape(-1)).astype('int64')
-                label = labels.astype('int64')
-
-                model.train()
-                avg_cost, prediction, acc = model(doc, label)
-                loss_data.append(float(avg_cost))
-
-                avg_cost.backward()
-                sgd_optimizer.minimize(avg_cost)
-                model.clear_gradients()
-
-                if batch_id % args.log_step == 0:
-                    time_end = time.time()
-                    used_time = time_end - time_begin
-                    # used_time may be 0.0, cause zero division error
-                    if used_time < 1e-5:
-                        used_time = 1e-5
-                    print(
-                        "step: %d, ave loss: %f, speed: %f steps/s"
-                        % (
-                            batch_id,
-                            float(avg_cost),
-                            args.log_step / used_time,
-                        )
+    loss_data = []
+    for eop in range(args.epoch):
+        time_begin = time.time()
+        for batch_id, data in enumerate(train_loader()):
+            word_ids, labels, seq_lens = data
+            doc = paddle.to_tensor(word_ids.numpy().reshape(-1), dtype="int64")
+            label = labels.astype('int64')
+
+            model.train()
+            avg_cost, prediction, acc = model(doc, label)
+            loss_data.append(float(avg_cost))
+
+            avg_cost.backward()
+            sgd_optimizer.minimize(avg_cost)
+            model.clear_gradients()
+
+            if batch_id % args.log_step == 0:
+                time_end = time.time()
+                used_time = time_end - time_begin
+                # used_time may be 0.0, cause zero division error
+                if used_time < 1e-5:
+                    used_time = 1e-5
+                print(
+                    "step: %d, ave loss: %f, speed: %f steps/s"
+                    % (
+                        batch_id,
+                        float(avg_cost),
+                        args.log_step / used_time,
                     )
-                    time_begin = time.time()
+                )
+                time_begin = time.time()
 
-                if batch_id == args.train_step:
-                    break
-                batch_id += 1
+            if batch_id == args.train_step:
+                break
+            batch_id += 1
     return loss_data
 
 
diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py
index 1b6a5148023f8..b1dc687abad3b 100644
--- a/test/dygraph_to_static/test_simnet.py
+++ b/test/dygraph_to_static/test_simnet.py
@@ -17,9 +17,7 @@
 import unittest
 
 import numpy as np
-from dygraph_to_static_utils import (
-    Dy2StTestBase,
-)
+from dygraph_to_static_utils import Dy2StTestBase, test_legacy_and_pt_and_pir
 from simnet_dygraph_model import BOW, HingeLoss
 
 import paddle
@@ -180,6 +178,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(Dy2StTestBase):
+    @test_legacy_and_pt_and_pir
     def test_dygraph_static_same_loss(self):
         if base.is_compiled_with_cuda():
             base.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py
index d87235ca8ce31..1c4476a99457d 100644
--- a/test/dygraph_to_static/test_simnet_v2.py
+++ b/test/dygraph_to_static/test_simnet_v2.py
@@ -17,7 +17,7 @@
 import unittest
 
 import numpy as np
-from dygraph_to_static_utils import Dy2StTestBase
+from dygraph_to_static_utils import Dy2StTestBase, test_legacy_and_pt_and_pir
 from simnet_dygraph_model_v2 import BOW, HingeLoss
 
 import paddle
@@ -177,6 +177,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(Dy2StTestBase):
+    @test_legacy_and_pt_and_pir
     def test_dygraph_static_same_loss(self):
         if paddle.is_compiled_with_cuda():
             paddle.base.set_flags({"FLAGS_cudnn_deterministic": True})

From 4f23e7e86fe854ae7da39a3036575a302df92f58 Mon Sep 17 00:00:00 2001
From: lzy <569782149@qq.com>
Date: Wed, 29 Nov 2023 10:37:59 +0800
Subject: [PATCH 07/14] optimize mem_eff_attn's compilation (#59446)

---
 paddle/phi/kernels/CMakeLists.txt             | 65 ++++++++++++++++++-
 .../generate_kernels.py                       | 13 ++--
 .../generate_variable_forward_kernels.py      | 13 ++--
 3 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 6fbbd0cf83306..5355b4e08c21b 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -66,14 +66,14 @@ if(WITH_CUTLASS)
     COMMAND
       ${PYTHON_EXECUTABLE}
       ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
+      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_tmp"
     RESULT_VARIABLE memory_efficient_attention_gen_res)
 
   execute_process(
     COMMAND
       ${PYTHON_EXECUTABLE}
       ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}"
+      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_variable_tmp"
     RESULT_VARIABLE memory_efficient_attention_gen_res)
 
   if(NOT memory_efficient_attention_gen_res EQUAL 0)
@@ -83,6 +83,67 @@ if(WITH_CUTLASS)
     )
   endif()
 
+  set(autogen_tmp_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
+  )
+  set(autogen_variable_tmp_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
+  )
+  set(autogen_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen
+  )
+  set(autogen_variable_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable
+  )
+
+  file(GLOB generated_files ${autogen_tmp_dir}/*.h ${autogen_tmp_dir}/impl/*.cu)
+
+  file(GLOB variable_generated_files ${autogen_variable_tmp_dir}/*.h
+       ${autogen_variable_tmp_dir}/impl/*.cu)
+
+  if(EXISTS ${autogen_dir})
+    foreach(gen_file ${generated_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                              "${gen_file}" "${now_file}")
+    endforeach()
+    message("copy if different ${autogen_dir}")
+  else()
+    foreach(gen_file ${generated_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
+                              "${now_file}")
+    endforeach()
+    message("copy ${autogen_dir}")
+  endif()
+
+  if(EXISTS ${autogen_variable_dir})
+    foreach(gen_file ${variable_generated_files})
+      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
+                     ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                              "${gen_file}" "${now_file}")
+    endforeach()
+    message("copy if different ${autogen_variable_dir}")
+  else()
+    foreach(gen_file ${variable_generated_files})
+      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
+                     ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
+                              "${now_file}")
+    endforeach()
+    message("copy ${autogen_variable_dir}")
+  endif()
+
+  file(
+    REMOVE_RECURSE
+    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
+  )
+  file(
+    REMOVE_RECURSE
+    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
+  )
+
   execute_process(
     COMMAND
       ${CMAKE_COMMAND} -E make_directory
diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
index 7caf30236bb79..a6633d02ecebe 100644
--- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
+++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
@@ -26,7 +26,6 @@
 import collections
 import itertools
 import os
-import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, TypeVar
@@ -94,6 +93,12 @@ def parse_args():
         default=convert_to_arch_list("All"),
         help="The CUDA architecture to be generated.",
     )
+    parser.add_argument(
+        "--gen_dir",
+        type=str,
+        default="autogen",
+        help="The directory to save the generated files.",
+    )
     args = parser.parse_args()
     args.max_arch = find_max_arch(args.cuda_arch)
     return args
@@ -425,7 +430,7 @@ def write_decl_impl(
     declarations += "} // namespace phi\n"
     declarations += f"#endif // {enable_def}\n"
 
-    autogen_dir = Path(args.dst_path) / "autogen"
+    autogen_dir = Path(args.dst_path) / args.gen_dir
     os.makedirs(autogen_dir, exist_ok=True)
     declaration_path = autogen_dir / f"{family_name}.h"
     declaration_path.write_text(declarations)
@@ -530,14 +535,12 @@ def write_main_header(forward_impl, backward_impl):
 #endif
 '''
 
-    path = Path(args.dst_path) / "autogen"
+    path = Path(args.dst_path) / args.gen_dir
     os.makedirs(path, exist_ok=True)
     path = Path(path) / "memory_efficient_attention.h"
     path.write_text(main_header_content)
 
 
-if os.path.exists(Path(args.dst_path) / "autogen"):
-    shutil.rmtree(Path(args.dst_path) / "autogen")
 forward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h"
 backward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h"
 
diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
index 86c08cdf8d107..07e710e52d206 100644
--- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
+++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
@@ -26,7 +26,6 @@
 import collections
 import itertools
 import os
-import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, TypeVar
@@ -94,6 +93,12 @@ def parse_args():
         default=convert_to_arch_list("All"),
         help="The CUDA architecture to be generated.",
     )
+    parser.add_argument(
+        "--gen_dir",
+        type=str,
+        default="autogen_variable",
+        help="The directory to save the generated files.",
+    )
     args = parser.parse_args()
     args.max_arch = find_max_arch(args.cuda_arch)
     return args
@@ -399,7 +404,7 @@ def write_decl_impl(
     declarations += "} // namespace phi\n"
     declarations += f"#endif // {enable_def}\n"
 
-    autogen_dir = Path(args.dst_path) / "autogen_variable"
+    autogen_dir = Path(args.dst_path) / args.gen_dir
     os.makedirs(autogen_dir, exist_ok=True)
     declaration_path = autogen_dir / f"{family_name}.h"
     declaration_path.write_text(declarations)
@@ -546,14 +551,12 @@ def write_main_header():
 #endif
 '''
 
-    path = Path(args.dst_path) / "autogen_variable"
+    path = Path(args.dst_path) / args.gen_dir
     os.makedirs(path, exist_ok=True)
     path = Path(path) / "memory_efficient_variable_attention.h"
     path.write_text(main_header_content)
 
 
-if os.path.exists(Path(args.dst_path) / "autogen_variable"):
-    shutil.rmtree(Path(args.dst_path) / "autogen_variable")
 forward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable/memory_efficient_variable_attention.h"
 
 write_main_header()

From 3b40279ef60ce6577af5c1671a69168e7ab97281 Mon Sep 17 00:00:00 2001
From: Ryan <44900829+DrRyanHuang@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:39:09 +0800
Subject: [PATCH 08/14] [Dy2St] pir dy2st unittest verification - Part -2
 (#59370)

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
---
 test/dygraph_to_static/test_assert.py     | 31 ++++++++++--------
 test/dygraph_to_static/test_dict.py       | 29 ++++++++++-------
 test/dygraph_to_static/test_isinstance.py | 38 ++++++++++++++---------
 3 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/test/dygraph_to_static/test_assert.py b/test/dygraph_to_static/test_assert.py
index bde776bf023d9..01250626f07e0 100644
--- a/test/dygraph_to_static/test_assert.py
+++ b/test/dygraph_to_static/test_assert.py
@@ -17,21 +17,17 @@
 import numpy
 from dygraph_to_static_utils import (
     Dy2StTestBase,
-    test_ast_only,
 )
 
 import paddle
 from paddle import base
-from paddle.jit.api import to_static
 
 
-@paddle.jit.to_static
 def dyfunc_assert_variable(x):
     x_v = base.dygraph.to_variable(x)
     assert x_v
 
 
-@to_static
 def dyfunc_assert_non_variable(x=True):
     assert x
 
@@ -51,31 +47,40 @@ def _run_dy_static(self, func, x, with_exception):
         self._run(func, x, with_exception, True)
         self._run(func, x, with_exception, False)
 
-    @test_ast_only
     def test_non_variable(self):
         self._run_dy_static(
-            dyfunc_assert_non_variable, x=False, with_exception=True
+            paddle.jit.to_static(dyfunc_assert_non_variable),
+            x=False,
+            with_exception=True,
         )
         self._run_dy_static(
-            dyfunc_assert_non_variable, x=True, with_exception=False
+            paddle.jit.to_static(dyfunc_assert_non_variable),
+            x=True,
+            with_exception=False,
         )
 
-    @test_ast_only
     def test_bool_variable(self):
         self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([False]), with_exception=True
+            paddle.jit.to_static(dyfunc_assert_variable),
+            x=numpy.array([False]),
+            with_exception=True,
         )
         self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([True]), with_exception=False
+            paddle.jit.to_static(dyfunc_assert_variable),
+            x=numpy.array([True]),
+            with_exception=False,
         )
 
-    @test_ast_only
     def test_int_variable(self):
         self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([0]), with_exception=True
+            paddle.jit.to_static(dyfunc_assert_variable),
+            x=numpy.array([0]),
+            with_exception=True,
         )
         self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([1]), with_exception=False
+            paddle.jit.to_static(dyfunc_assert_variable),
+            x=numpy.array([1]),
+            with_exception=False,
         )
 
 
diff --git a/test/dygraph_to_static/test_dict.py b/test/dygraph_to_static/test_dict.py
index b8b2480573527..f69b112ba9afd 100644
--- a/test/dygraph_to_static/test_dict.py
+++ b/test/dygraph_to_static/test_dict.py
@@ -15,11 +15,13 @@
 import unittest
 
 import numpy as np
-from dygraph_to_static_utils import Dy2StTestBase
+from dygraph_to_static_utils import (
+    Dy2StTestBase,
+    test_legacy_and_pt_and_pir,
+)
 
 import paddle
 from paddle import base
-from paddle.jit import to_static
 
 PLACE = base.CUDAPlace(0) if base.is_compiled_with_cuda() else base.CPUPlace()
 
@@ -79,7 +81,6 @@ def __init__(self, batch_size=64, hidden_size=16, output_size=16):
         self.output_size = output_size
         self.sub_net = SubNetWithDict(hidden_size, output_size)
 
-    @to_static
     def forward(self, input, max_len=4):
         input = base.dygraph.to_variable(input)
         cache = {
@@ -135,17 +136,19 @@ def _run_dygraph(self):
     def train(self, to_static=False):
         paddle.jit.enable_to_static(to_static)
         with base.dygraph.guard(PLACE):
-            net = MainNetWithDict(batch_size=self.batch_size)
+            net = paddle.jit.to_static(
+                MainNetWithDict(batch_size=self.batch_size)
+            )
             ret = net(self.x)
             return ret.numpy()
 
+    @test_legacy_and_pt_and_pir
     def test_ast_to_func(self):
         self.assertTrue((self._run_dygraph() == self._run_static()).all())
 
 
 # Tests for dict pop
-@paddle.jit.to_static
-def test_dic_pop(x):
+def test_dict_pop(x):
     x = paddle.to_tensor(x)
     dict_a = {"red": 0, "green": 1, "blue": 2}
 
@@ -156,8 +159,7 @@ def test_dic_pop(x):
     return out
 
 
-@paddle.jit.to_static
-def test_dic_pop_2(x):
+def test_dict_pop_2(x):
     x = paddle.to_tensor(x)
     dict_a = {"red": x, "green": x + 1, "blue": x + 3}
 
@@ -179,7 +181,7 @@ def setUp(self):
         self._set_test_func()
 
     def _set_test_func(self):
-        self.dygraph_func = test_dic_pop
+        self.dygraph_func = paddle.jit.to_static(test_dict_pop)
 
     def _run_static(self):
         return self._run(to_static=True)
@@ -194,6 +196,7 @@ def _run(self, to_static):
 
         return result.numpy()
 
+    @test_legacy_and_pt_and_pir
     def test_transformed_result(self):
         dygraph_res = self._run_dygraph()
         static_res = self._run_static()
@@ -207,14 +210,13 @@ def test_transformed_result(self):
 
 class TestDictPop2(TestDictPop):
     def _set_test_func(self):
-        self.dygraph_func = test_dic_pop_2
+        self.dygraph_func = paddle.jit.to_static(test_dict_pop_2)
 
 
 class NetWithDictPop(paddle.nn.Layer):
     def __init__(self):
         super().__init__()
 
-    @to_static
     def forward(self, x, **kwargs):
         x = paddle.to_tensor(x)
         y = kwargs.pop('y', None)
@@ -233,10 +235,11 @@ def setUp(self):
     def train(self, to_static=False):
         paddle.jit.enable_to_static(to_static)
         with base.dygraph.guard(PLACE):
-            net = NetWithDictPop()
+            net = paddle.jit.to_static(NetWithDictPop())
             ret = net(z=0, x=self.x, y=True)
             return ret.numpy()
 
+    @test_legacy_and_pt_and_pir
     def test_ast_to_func(self):
         dygraph_result = self._run_dygraph()
         static_result = self._run_static()
@@ -248,6 +251,7 @@ def test_ast_to_func(self):
 
 
 class TestDictCmpInFor(Dy2StTestBase):
+    @test_legacy_and_pt_and_pir
     def test_with_for(self):
         def func():
             pos = [1, 3]
@@ -264,6 +268,7 @@ def func():
 
         self.assertEqual(paddle.jit.to_static(func)()['minus'], 8)
 
+    @test_legacy_and_pt_and_pir
     def test_with_for_enumerate(self):
         def func():
             pos = [1, 3]
diff --git a/test/dygraph_to_static/test_isinstance.py b/test/dygraph_to_static/test_isinstance.py
index 498c84ad0e885..9cac83cdbbe2f 100644
--- a/test/dygraph_to_static/test_isinstance.py
+++ b/test/dygraph_to_static/test_isinstance.py
@@ -88,24 +88,34 @@ def train(model, to_static):
 class TestIsinstance(Dy2StTestBase):
     @test_legacy_and_pt_and_pir
     def test_isinstance_simple_return_layer(self):
-        model = paddle.jit.to_static(IsInstanceLayer(SimpleReturnLayer()))
-        self._test_model(model)
+        model_creator = lambda: paddle.jit.to_static(
+            IsInstanceLayer(SimpleReturnLayer())
+        )
+        self._test_model(model_creator)
 
+    @test_legacy_and_pt_and_pir
     def test_isinstance_add_attr_layer(self):
-        model = paddle.jit.to_static(IsInstanceLayer(AddAttrLayer()))
-        self._test_model(model)
+        model_creator = lambda: paddle.jit.to_static(
+            IsInstanceLayer(AddAttrLayer())
+        )
+        self._test_model(model_creator)
 
+    @test_legacy_and_pt_and_pir
     def test_sequential_layer(self):
-        layers = []
-        for i in range(5):
-            layers.append(SimpleReturnLayer())
-            layers.append(AddAttrLayer())
-        model = paddle.jit.to_static(SequentialLayer(layers))
-        self._test_model(model)
-
-    def _test_model(self, model):
-        st_out = train(model, to_static=True)
-        dy_out = train(model, to_static=False)
+        def model_creator():
+            layers = []
+            for i in range(5):
+                layers.append(SimpleReturnLayer())
+                layers.append(AddAttrLayer())
+            return paddle.jit.to_static(SequentialLayer(layers))
+
+        self._test_model(model_creator)
+
+    def _test_model(self, model_creator):
+        st_model = model_creator()
+        st_out = train(st_model, to_static=True)
+        dy_model = model_creator()
+        dy_out = train(dy_model, to_static=False)
         np.testing.assert_allclose(
             dy_out,
             st_out,

From 87bf502307ac3041f3e1171415b052fe8815e0be Mon Sep 17 00:00:00 2001
From: zhengzhonghui <38675903+deepllz@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:43:00 +0800
Subject: [PATCH 09/14] [AutoParallel] Add reduce_all spmd rule (#59411)

* [AutoParallel] Add reduce_all spmd rule

* fix compile error on Windows

* add c++ unittest for reduce_all spmd
---
 paddle/phi/api/yaml/legacy_ops.yaml           |  1 +
 .../auto_parallel/reshard/reshard_utils.h     |  6 ++--
 paddle/phi/core/distributed/gloo_utils.h      |  7 ++++
 paddle/phi/core/visit_type.h                  | 29 +++++++++++++++--
 paddle/phi/infermeta/spmd_rules/reduction.cc  |  7 ++++
 paddle/phi/infermeta/spmd_rules/reduction.h   |  4 +++
 paddle/phi/kernels/gpu/all_reduce_kernel.cu   |  5 +++
 .../semi_auto_parallel_for_reduction.py       | 32 +++++++++++++++++--
 test/cpp/auto_parallel/spmd_rule_test.cc      | 21 ++++++++++++
 9 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index c14588389a706..3cfcf155f47c4 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -34,6 +34,7 @@
   output : Tensor(out)
   infer_meta :
     func : ReduceInferMeta
+    spmd_rule : ReductionAllInferSpmdDynamic
   kernel :
     func : all
 
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h
index f064d038c5186..022dc06598064 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h
@@ -76,14 +76,14 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   do {                                                                \
     if (phi::CPUContext::classof(dev_ctx)) {                          \
       VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU.";   \
-      PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                           \
+      PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU(              \
           dtype, #fn_name, ([&] {                                     \
             fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx), \
                             __VA_ARGS__);                             \
           }));                                                        \
     } else if (phi::GPUContext::classof(dev_ctx)) {                   \
       VLOG(4) << "Call `" << #fn_name << "` in Resharding on GPU.";   \
-      PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(                  \
+      PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(              \
           dtype, #fn_name, ([&] {                                     \
             fn_name<data_t>(static_cast<const GPUContext&>(*dev_ctx), \
                             __VA_ARGS__);                             \
@@ -99,7 +99,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   do {                                                                    \
     if (phi::CPUContext::classof(dev_ctx)) {                              \
       VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU.";       \
-      PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                               \
+      PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU(                  \
           dtype, #fn_name, ([&] {                                         \
             fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx),     \
                             __VA_ARGS__);                                 \
diff --git a/paddle/phi/core/distributed/gloo_utils.h b/paddle/phi/core/distributed/gloo_utils.h
index 36680a873111a..5c0be5d5ca0cf 100644
--- a/paddle/phi/core/distributed/gloo_utils.h
+++ b/paddle/phi/core/distributed/gloo_utils.h
@@ -142,6 +142,13 @@ void SetReduceFunc(P* opts, int reduce_type) {
           static_cast<void (*)(void*, const void*, const void*, size_t)>(
               &gloo::product<T>));
       break;
+    case ReduceType::kRedAll:
+      // NOTE(zhonghui): gloo has no reduce_all function; min is used instead,
+      // which is equivalent to logical AND for bool (0/1) data.
+      opts->setReduceFunction(
+          static_cast<void (*)(void*, const void*, const void*, size_t)>(
+              &gloo::min<T>));
+      break;
     default:
       PADDLE_THROW(
           errors::InvalidArgument("Unsupport reduce type: %d.", reduce_type));
diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
index 9f1b258bb5b03..047ba79bc1599 100644
--- a/paddle/phi/core/visit_type.h
+++ b/paddle/phi/core/visit_type.h
@@ -151,7 +151,7 @@ namespace phi {
 ///////// BOOL and Floating and Integral Dispatch Marco ///////////
 
 #if NCCL_VERSION_CODE >= 21000
-#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...)        \
+#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...)    \
   [&] {                                                                       \
     const auto& __dtype__ = TYPE;                                             \
     switch (__dtype__) {                                                      \
@@ -180,7 +180,7 @@ namespace phi {
     }                                                                         \
   }()
 #else
-#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...)        \
+#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...)    \
   [&] {                                                                       \
     const auto& __dtype__ = TYPE;                                             \
     switch (__dtype__) {                                                      \
@@ -208,6 +208,31 @@ namespace phi {
   }()
 #endif
 
+#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU(TYPE, NAME, ...)    \
+  [&] {                                                                       \
+    const auto& __dtype__ = TYPE;                                             \
+    switch (__dtype__) {                                                      \
+      PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__)    \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__)             \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__)                \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__)              \
+      PD_PRIVATE_CASE_TYPE(                                                   \
+          NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__)              \
+      default:                                                                \
+        PD_THROW("function " #NAME " is not implemented for data type `",     \
+                 __dtype__,                                                   \
+                 "`");                                                        \
+    }                                                                         \
+  }()
+
 ///////// Floating and Complex Dispatch Marco ///////////
 
 #define PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...)              \
diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc
index a1fc0873a244a..1129c2ab81e35 100644
--- a/paddle/phi/infermeta/spmd_rules/reduction.cc
+++ b/paddle/phi/infermeta/spmd_rules/reduction.cc
@@ -159,6 +159,13 @@ SpmdInfo ReductionMaxInferSpmdDynamic(const DistMetaTensor& x,
       x, axis.GetData(), keep_dim, static_cast<int>(ReduceType::kRedMax));
 }
 
+SpmdInfo ReductionAllInferSpmdDynamic(const DistMetaTensor& x,
+                                      const IntArray& axis,
+                                      bool keep_dim) {
+  return ReductionInferSpmdBase(
+      x, axis.GetData(), keep_dim, static_cast<int>(ReduceType::kRedAll));
+}
+
 SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x,
                                    const DistMetaTensor& out,
                                    const std::vector<int64_t>& axis,
diff --git a/paddle/phi/infermeta/spmd_rules/reduction.h b/paddle/phi/infermeta/spmd_rules/reduction.h
index 30144e6d7ca46..a168f4cb500e5 100644
--- a/paddle/phi/infermeta/spmd_rules/reduction.h
+++ b/paddle/phi/infermeta/spmd_rules/reduction.h
@@ -44,6 +44,10 @@ SpmdInfo ReductionMaxInferSpmdDynamic(const DistMetaTensor& x,
                                       const IntArray& axis,
                                       bool keep_dim);
 
+SpmdInfo ReductionAllInferSpmdDynamic(const DistMetaTensor& x,
+                                      const IntArray& axis,
+                                      bool keep_dim);
+
 SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x,
                                    const DistMetaTensor& out,
                                    const std::vector<int64_t>& axis,
diff --git a/paddle/phi/kernels/gpu/all_reduce_kernel.cu b/paddle/phi/kernels/gpu/all_reduce_kernel.cu
index 9cd283695c89a..0c920ef1bc61e 100644
--- a/paddle/phi/kernels/gpu/all_reduce_kernel.cu
+++ b/paddle/phi/kernels/gpu/all_reduce_kernel.cu
@@ -57,6 +57,11 @@ void AllReduceKernel(const Context& dev_ctx,
     case ReduceType::kRedProd:
       red_type = ncclProd;
       break;
+    case ReduceType::kRedAll:
+      // NOTE(zhonghui): ncclRedOp_t has no reduce_all type; ncclMin is used
+      // instead, which is equivalent to logical AND for bool (0/1) data.
+      red_type = ncclMin;
+      break;
   }
   comm_ctx->AllReduce(out, x, red_type, stream);
 #else
diff --git a/test/auto_parallel/semi_auto_parallel_for_reduction.py b/test/auto_parallel/semi_auto_parallel_for_reduction.py
index 5cd7ef4596268..15c43b9f56304 100644
--- a/test/auto_parallel/semi_auto_parallel_for_reduction.py
+++ b/test/auto_parallel/semi_auto_parallel_for_reduction.py
@@ -37,8 +37,11 @@ def test_body(
     ):
         paddle.seed(self._seed)
         np.random.seed(self._seed)
+        is_op_func_all = op_func == paddle.all
 
         x = paddle.randn(x_shape, self._dtype)
+        if is_op_func_all:
+            x = x > 0
         x.stop_gradient = False
 
         dist_x = dist.shard_tensor(x, self._mesh, x_placements)
@@ -49,9 +52,10 @@ def test_body(
         self.check_tensor_eq(out, dist_out)
         np.testing.assert_equal(dist_out.shape, out_shape, verbose=True)
 
-        dist_out.backward()
-        out.backward()
-        self.check_tensor_eq(x.grad, dist_x.grad)
+        if not is_op_func_all:
+            dist_out.backward()
+            out.backward()
+            self.check_tensor_eq(x.grad, dist_x.grad)
 
     def test_sum_x_shard(self):
         self.test_body(
@@ -113,6 +117,26 @@ def test_max_x_shard_on_axis(self):
             op_func=paddle.max,
         )
 
+    def test_all_x_shard(self):
+        self.test_body(
+            x_shape=[4, 8, 6],
+            out_shape=[4, 6],
+            x_placements=[dist.Shard(0)],
+            axis=1,
+            keepdim=False,
+            op_func=paddle.all,
+        )
+
+    def test_all_x_shard_on_axis(self):
+        self.test_body(
+            x_shape=[4, 8, 6],
+            out_shape=[4, 6],
+            x_placements=[dist.Shard(1)],
+            axis=1,
+            keepdim=False,
+            op_func=paddle.all,
+        )
+
     def run_test_case(self):
         if self._backend == "cpu":
             paddle.set_device("cpu")
@@ -127,6 +151,8 @@ def run_test_case(self):
         self.test_mean_x_shard()
         self.test_max_x_shard()
         self.test_max_x_shard_on_axis()
+        self.test_all_x_shard()
+        self.test_all_x_shard_on_axis()
 
 
 if __name__ == '__main__':
diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc
index 844758e923e3b..2a13d46db2798 100644
--- a/test/cpp/auto_parallel/spmd_rule_test.cc
+++ b/test/cpp/auto_parallel/spmd_rule_test.cc
@@ -1006,6 +1006,27 @@ TEST(ReduceMaxRule, Ctor) {
   check_partial_dims(backward_info.second[0], {});
 }
 
+TEST(ReduceAllRule, Ctor) {
+  std::vector<int64_t> mesh_shape = {2};
+  std::vector<int64_t> process_ids = {0, 1};
+  std::vector<std::string> dim_names = {"x"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  // test forward
+  auto t_dist_attr = TensorDistAttr();
+  t_dist_attr.set_process_mesh(process_mesh);
+  t_dist_attr.set_dims_mapping({-1, 0, -1});
+  t_dist_attr.set_dynamic_dims({false, false, false});
+  phi::distributed::DistMetaTensor x =
+      phi::distributed::DistMetaTensor(phi::make_ddim({4, 6, 8}), t_dist_attr);
+  IntArray axis = {1};
+  bool keep_dim = false;
+  phi::distributed::SpmdInfo forward_info =
+      phi::distributed::ReductionAllInferSpmdDynamic(x, axis, keep_dim);
+  check_dim_mapping(forward_info.second[0], {-1, -1});
+  check_partial_dims(forward_info.second[0], {0});
+}
+
 TEST(Numel, Ctor) {
   std::vector<int64_t> mesh_shape = {2, 2};
   std::vector<int64_t> process_ids = {0, 1, 2, 3};

From 5e897083b58b0ff991854c963d791fcab0946928 Mon Sep 17 00:00:00 2001
From: zyt1024 <42999008+zyt1024@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:43:27 +0800
Subject: [PATCH 10/14] =?UTF-8?q?=E3=80=90complex=20op=E3=80=91=20add=20co?=
 =?UTF-8?q?mplex=20support=20for=20=20unbind,broadcast,broadcast=5Ftensors?=
 =?UTF-8?q?=20and=20broadcast=5Ftensor=5Fgrad=20(#59122)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add complex support for unbind,broadcast,broadcast_tensors and broadcast_tensors_grad

* add test_dtype

* add complex support for unbind,broadcast,broadcast_tensors and broadcast_tensor_grad

* fix code_style

* Resolve conflicts and generate complex data.
---
 paddle/phi/kernels/cpu/broadcast_kernel.cc    |   4 +-
 .../cpu/broadcast_tensors_grad_kernel.cc      |   4 +-
 .../kernels/cpu/broadcast_tensors_kernel.cc   |   4 +-
 paddle/phi/kernels/cpu/unbind_kernel.cc       |   4 +-
 paddle/phi/kernels/gpu/broadcast_kernel.cu    |   8 +-
 .../gpu/broadcast_tensors_grad_kernel.cu      |   4 +-
 .../kernels/gpu/broadcast_tensors_kernel.cu   |   4 +-
 paddle/phi/kernels/gpu/unbind_kernel.cu       |   4 +-
 .../distributed/communication/broadcast.py    |   2 +-
 python/paddle/tensor/manipulation.py          |   8 +-
 .../cpp/phi/kernels/test_ternary_broadcast.cu |  48 ++++++
 test/legacy_test/op_test.py                   |   2 +
 test/legacy_test/test_broadcast_error.py      |  20 ++-
 test/legacy_test/test_broadcast_tensors_op.py | 107 ++++++++----
 test/legacy_test/test_unbind_op.py            | 152 +++++++++++++++---
 15 files changed, 310 insertions(+), 65 deletions(-)

diff --git a/paddle/phi/kernels/cpu/broadcast_kernel.cc b/paddle/phi/kernels/cpu/broadcast_kernel.cc
index baa12d1815edc..0deb8d8bbc562 100644
--- a/paddle/phi/kernels/cpu/broadcast_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_kernel.cc
@@ -62,4 +62,6 @@ PD_REGISTER_KERNEL(broadcast,
                    int8_t,
                    uint8_t,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
index 0656f681367ff..8f73c5c5f5f6e 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -199,4 +199,6 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
index 3ad26164d7d8d..7d0e08655fc27 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
@@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(broadcast_tensors,
                    int64_t,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc
index e8d0c01352c97..255f73af1aca7 100644
--- a/paddle/phi/kernels/cpu/unbind_kernel.cc
+++ b/paddle/phi/kernels/cpu/unbind_kernel.cc
@@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(unbind,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/broadcast_kernel.cu b/paddle/phi/kernels/gpu/broadcast_kernel.cu
index 4b46e218c328e..e4986f752b1ae 100644
--- a/paddle/phi/kernels/gpu/broadcast_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_kernel.cu
@@ -66,7 +66,9 @@ PD_REGISTER_KERNEL(broadcast,
                    int8_t,
                    uint8_t,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
 #else
 PD_REGISTER_KERNEL(broadcast,
                    GPU,
@@ -79,5 +81,7 @@ PD_REGISTER_KERNEL(broadcast,
                    int8_t,
                    uint8_t,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index 40ea1f195069e..1c56b93c7c1dc 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -111,4 +111,6 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad,
                    float,
                    double,
                    phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
index 3d16797cb66c0..aae7d53aeb43a 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
@@ -28,4 +28,6 @@ PD_REGISTER_KERNEL(broadcast_tensors,
                    float,
                    double,
                    phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/unbind_kernel.cu b/paddle/phi/kernels/gpu/unbind_kernel.cu
index 37272cebdf118..178191f048e30 100644
--- a/paddle/phi/kernels/gpu/unbind_kernel.cu
+++ b/paddle/phi/kernels/gpu/unbind_kernel.cu
@@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(unbind,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/python/paddle/distributed/communication/broadcast.py b/python/paddle/distributed/communication/broadcast.py
index 208158cd20918..9c87e0345db5f 100644
--- a/python/paddle/distributed/communication/broadcast.py
+++ b/python/paddle/distributed/communication/broadcast.py
@@ -37,7 +37,7 @@ def broadcast(tensor, src, group=None, sync_op=True):
 
     Args:
         tensor (Tensor): The tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
-            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
+            should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128.
         src (int): The source rank in global view.
         group (Group, optional): The group instance return by new_group or None for global default group.
         sync_op (bool, optional): Whether this op is a sync op. The default value is True.
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 5bec599390fdb..a0ed76a3b970d 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1349,7 +1349,7 @@ def broadcast_tensors(input, name=None):
 
     Args:
         input (list|tuple): ``input`` is a Tensor list or Tensor tuple which is with data type bool,
-            float16, float32, float64, int32, int64. All the Tensors in ``input`` must have same data type.
+            float16, float32, float64, int32, int64, complex64, complex128. All the Tensors in ``input`` must have same data type.
             Currently we only support tensors with rank no greater than 5.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
@@ -1390,6 +1390,8 @@ def broadcast_tensors(input, name=None):
                     'int32',
                     'int64',
                     'uint16',
+                    'complex64',
+                    'complex128',
                 ],
                 'broadcast_tensors',
             )
@@ -3037,7 +3039,7 @@ def unbind(input, axis=0):
     Removes a tensor dimension, then split the input tensor into multiple sub-Tensors.
 
     Args:
-        input (Tensor): The input variable which is an N-D Tensor, data type being bool, float16, float32, float64, int32 or int64.
+        input (Tensor): The input variable which is an N-D Tensor, data type being bool, float16, float32, float64, int32, int64, complex64 or complex128.
         axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind.
             If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0.
     Returns:
@@ -3094,6 +3096,8 @@ def unbind(input, axis=0):
                 'float64',
                 'int32',
                 'int64',
+                'complex64',
+                'complex128',
             ],
             'unbind',
         )
diff --git a/test/cpp/phi/kernels/test_ternary_broadcast.cu b/test/cpp/phi/kernels/test_ternary_broadcast.cu
index 09598e637909a..959b79725f07a 100644
--- a/test/cpp/phi/kernels/test_ternary_broadcast.cu
+++ b/test/cpp/phi/kernels/test_ternary_broadcast.cu
@@ -122,6 +122,22 @@ TEST(Broadcast, add) {
                                    dim_out,
                                    times,
                                    AddTernary_1<phi::dtype::bfloat16>());
+    TestCase<phi::dtype::complex<float>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_1<phi::dtype::complex<float>>());
+    TestCase<phi::dtype::complex<double>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_1<phi::dtype::complex<double>>());
   } while (0);
 
   do {
@@ -145,6 +161,22 @@ TEST(Broadcast, add) {
                                    dim_out,
                                    times,
                                    AddTernary_2<phi::dtype::bfloat16>());
+    TestCase<phi::dtype::complex<float>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_2<phi::dtype::complex<float>>());
+    TestCase<phi::dtype::complex<double>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_2<phi::dtype::complex<double>>());
   } while (0);
 
   do {
@@ -168,6 +200,22 @@ TEST(Broadcast, add) {
                                    dim_out,
                                    times,
                                    AddTernary_3<phi::dtype::bfloat16>());
+    TestCase<phi::dtype::complex<float>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_3<phi::dtype::complex<float>>());
+    TestCase<phi::dtype::complex<double>>(
+        *dev_ctx,
+        dim1,
+        dim2,
+        dim3,
+        dim_out,
+        times,
+        AddTernary_3<phi::dtype::complex<double>>());
   } while (0);
 #endif
 }
diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index 79a289f65890e..759d76dabef9a 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -671,6 +671,8 @@ def infer_dtype(numpy_dict, dtype_set):
         input_dtype_set = set()
         infer_dtype(inputs, input_dtype_set)
         dtype_list = [
+            np.dtype(np.complex128),
+            np.dtype(np.complex64),
             np.dtype(np.float64),
             np.dtype(np.float32),
             np.dtype(np.float16),
diff --git a/test/legacy_test/test_broadcast_error.py b/test/legacy_test/test_broadcast_error.py
index d42cc6d9b8840..e5defec467d9f 100644
--- a/test/legacy_test/test_broadcast_error.py
+++ b/test/legacy_test/test_broadcast_error.py
@@ -23,7 +23,12 @@
 class TestBroadcastOpCpu(OpTest):
     def setUp(self):
         self.op_type = "broadcast"
-        input = np.random.random((100, 2)).astype("float32")
+        self.init_dtype()
+        input = np.random.random((100, 2)).astype(self.dtype)
+        if self.dtype == 'complex64' or self.dtype == 'complex128':
+            input = (
+                np.random.random((100, 2)) + 1j * np.random.random((100, 2))
+            ).astype(self.dtype)
         np_out = input[:]
         self.inputs = {"X": input}
         self.attrs = {"sync_mode": False, "root": 0}
@@ -35,6 +40,19 @@ def test_check_output_cpu(self):
         except:
             print("do not support cpu test, skip")
 
+    def init_dtype(self):
+        self.dtype = 'float32'
+
+
+class TestBroadcastOpCpu_complex64(TestBroadcastOpCpu):
+    def init_dtype(self):
+        self.dtype = 'complex64'
+
+
+class TestBroadcastOpCpu_complex128(TestBroadcastOpCpu):
+    def init_dtype(self):
+        self.dtype = 'complex128'
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/legacy_test/test_broadcast_tensors_op.py b/test/legacy_test/test_broadcast_tensors_op.py
index 9f5b7b76caacb..d8de6e1bba8a8 100644
--- a/test/legacy_test/test_broadcast_tensors_op.py
+++ b/test/legacy_test/test_broadcast_tensors_op.py
@@ -47,7 +47,10 @@ def find_output_shape(input_list):
 def make_inputs_outputs(input_shapes, dtype, is_bfloat16=False):
     """Automatically generate formatted inputs and outputs from input_shapes"""
     input_list = [
-        np.random.random(shape).astype(dtype) for shape in input_shapes
+        (np.random.random(shape) + 1j * np.random.random(shape)).astype(dtype)
+        if dtype == 'complex64' or dtype == 'complex128'
+        else np.random.random(shape).astype(dtype)
+        for shape in input_shapes
     ]
     output_shape = find_output_shape(input_list)
     output_list = [
@@ -98,8 +101,8 @@ class TestCPUBroadcastTensorsOp(OpTest):
     def set_place(self):
         self.place = core.CPUPlace()
 
-    def set_dtypes(self):
-        self.dtypes = ['float64']
+    def set_dtype(self):
+        self.dtype = 'float64'
 
     def setUp(self):
         self.op_type = "broadcast_tensors"
@@ -112,26 +115,24 @@ def setUp(self):
             gen_empty_tensors_test,
         ]
         self.set_place()
-        self.set_dtypes()
+        self.set_dtype()
         self.python_api = paddle.broadcast_tensors
 
     def run_dual_test(self, test_func, args):
-        for dtype in self.dtypes:
-            for gen_func in self.test_gen_func_list:
-                self.inputs, self.outputs = gen_func(dtype)
-                if len(self.outputs["Out"]) < 3:
-                    self.python_out_sig = [
-                        f"out{i}" for i in range(len(self.outputs["Out"]))
-                    ]
-                    test_func(**args)
+        for gen_func in self.test_gen_func_list:
+            self.inputs, self.outputs = gen_func(self.dtype)
+            if len(self.outputs["Out"]) < 3:
+                self.python_out_sig = [
+                    f"out{i}" for i in range(len(self.outputs["Out"]))
+                ]
+                test_func(**args)
 
     def run_triple_in_test(self, test_func, args):
-        for dtype in self.dtypes:
-            self.inputs, self.outputs = self.test_gen_func_list[2](dtype)
-            self.python_out_sig = [
-                f"out{i}" for i in range(len(self.outputs["Out"]))
-            ]
-            test_func(**args)
+        self.inputs, self.outputs = self.test_gen_func_list[2](self.dtype)
+        self.python_out_sig = [
+            f"out{i}" for i in range(len(self.outputs["Out"]))
+        ]
+        test_func(**args)
 
     def test_check_output(self):
         self.run_dual_test(
@@ -160,6 +161,16 @@ def test_check_grad_normal(self):
         )
 
 
+class TestCPUBroadcastTensorsOp_complex64(TestCPUBroadcastTensorsOp):
+    def set_dtype(self):
+        self.dtype = 'complex64'
+
+
+class TestCPUBroadcastTensorsOp_complex128(TestCPUBroadcastTensorsOp):
+    def set_dtype(self):
+        self.dtype = 'complex128'
+
+
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
@@ -239,28 +250,44 @@ def test_check_grad_normal(self):
 
 
 class TestBroadcastTensorsAPI(unittest.TestCase):
+    def setUp(self):
+        self.dtype = 'float32'
+
     def test_api(self):
         @test_with_pir_api
         def test_static():
-            inputs = [
-                paddle.static.data(
-                    shape=[-1, 4, 1, 4, 1], dtype='float32', name="x0"
-                ),
-                paddle.static.data(
-                    shape=[-1, 1, 4, 1, 4], dtype='float32', name="x1"
-                ),
-            ]
-            paddle.broadcast_tensors(inputs)
+            prog = paddle.static.Program()
+            startup_prog = paddle.static.Program()
+            with paddle.static.program_guard(prog, startup_prog):
+                inputs = [
+                    paddle.static.data(
+                        shape=[-1, 4, 1, 4, 1], dtype=self.dtype, name="x0"
+                    ),
+                    paddle.static.data(
+                        shape=[-1, 1, 4, 1, 4], dtype=self.dtype, name="x1"
+                    ),
+                ]
+                paddle.broadcast_tensors(inputs)
 
         def test_dynamic():
             paddle.disable_static()
             try:
                 inputs = [
                     paddle.to_tensor(
-                        np.random.random([4, 1, 4, 1]).astype("float32")
+                        np.random.random([4, 1, 4, 1]).astype(self.dtype)
+                        if self.dtype == 'float32'
+                        else (
+                            np.random.random([4, 1, 4, 1])
+                            + 1j * np.random.random([4, 1, 4, 1])
+                        ).astype(self.dtype)
                     ),
                     paddle.to_tensor(
-                        np.random.random([1, 4, 1, 4]).astype("float32")
+                        np.random.random([1, 4, 1, 4]).astype(self.dtype)
+                        if self.dtype == 'float32'
+                        else (
+                            np.random.random([1, 4, 1, 4])
+                            + 1j * np.random.random([1, 4, 1, 4])
+                        ).astype(self.dtype)
                     ),
                 ]
                 paddle.broadcast_tensors(inputs)
@@ -271,6 +298,16 @@ def test_dynamic():
         test_dynamic()
 
 
+class TestBroadcastTensorsAPI_complex64(TestBroadcastTensorsAPI):
+    def setUp(self):
+        self.dtype = 'complex64'
+
+
+class TestBroadcastTensorsAPI_complex128(TestBroadcastTensorsAPI):
+    def setUp(self):
+        self.dtype = 'complex128'
+
+
 class TestRaiseBroadcastTensorsError(unittest.TestCase):
     def test_errors(self):
         def test_type():
@@ -306,9 +343,21 @@ def test_bcast_semantics():
             ]
             paddle.broadcast_tensors(inputs)
 
+        def test_bcast_semantics_complex64():
+            inputs = [
+                paddle.static.data(
+                    shape=[-1, 1, 3, 1, 1], dtype='complex64', name="x11"
+                ),
+                paddle.static.data(
+                    shape=[-1, 1, 8, 1, 1], dtype='complex64', name="x12"
+                ),
+            ]
+            paddle.broadcast_tensors(inputs)
+
         self.assertRaises(TypeError, test_type)
         self.assertRaises(TypeError, test_dtype)
         self.assertRaises(TypeError, test_bcast_semantics)
+        self.assertRaises(TypeError, test_bcast_semantics_complex64)
 
 
 class TestRaiseBroadcastTensorsErrorDyGraph(unittest.TestCase):
diff --git a/test/legacy_test/test_unbind_op.py b/test/legacy_test/test_unbind_op.py
index c01858c06ad5e..833ffd824bb13 100644
--- a/test/legacy_test/test_unbind_op.py
+++ b/test/legacy_test/test_unbind_op.py
@@ -24,28 +24,38 @@
 
 
 class TestUnbind(unittest.TestCase):
+    def setUp(self):
+        self.init_dtype()
+        self.input_1 = np.random.random([2, 3]).astype(self.dtype)
+        if self.dtype == 'complex64' or self.dtype == 'complex128':
+            self.input_1 = (
+                np.random.random([2, 3]) + 1j * np.random.random([2, 3])
+            ).astype(self.dtype)
+
+    def init_dtype(self):
+        self.dtype = 'float32'
+
     @test_with_pir_api
     def test_unbind(self):
         paddle.enable_static()
-
+        self.init_dtype()
         main_program = static.Program()
         startup_program = static.Program()
         with static.program_guard(
             main_program=main_program, startup_program=startup_program
         ):
-            x_1 = paddle.static.data(shape=[2, 3], dtype='float32', name='x_1')
+            x_1 = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x_1')
             [out_0, out_1] = tensor.unbind(input=x_1, axis=0)
-            input_1 = np.random.random([2, 3]).astype("float32")
             axis = paddle.static.data(shape=[], dtype='int32', name='axis')
             exe = base.Executor(place=base.CPUPlace())
 
             [res_1, res_2] = exe.run(
-                feed={"x_1": input_1, "axis": 0},
+                feed={"x_1": self.input_1, "axis": 0},
                 fetch_list=[out_0, out_1],
             )
 
-            np.testing.assert_array_equal(res_1, input_1[0, 0:100])
-            np.testing.assert_array_equal(res_2, input_1[1, 0:100])
+            np.testing.assert_array_equal(res_1, self.input_1[0, 0:100])
+            np.testing.assert_array_equal(res_2, self.input_1[1, 0:100])
 
     @test_with_pir_api
     def test_unbind_static_fp16_gpu(self):
@@ -73,38 +83,74 @@ def test_unbind_static_fp16_gpu(self):
 
     def test_unbind_dygraph(self):
         with base.dygraph.guard():
-            np_x = np.random.random([2, 3]).astype("float32")
-            x = paddle.to_tensor(np_x)
+            x = paddle.to_tensor(self.input_1)
             x.stop_gradient = False
             [res_1, res_2] = paddle.unbind(x, 0)
-            np.testing.assert_array_equal(res_1, np_x[0, 0:100])
-            np.testing.assert_array_equal(res_2, np_x[1, 0:100])
+            np.testing.assert_array_equal(res_1, self.input_1[0, 0:100])
+            np.testing.assert_array_equal(res_2, self.input_1[1, 0:100])
 
             out = paddle.add_n([res_1, res_2])
 
-            np_grad = np.ones(x.shape, np.float32)
+            np_grad = np.ones(x.shape, self.dtype)
             out.backward()
             np.testing.assert_array_equal(x.grad.numpy(False), np_grad)
 
 
+class TestUnbind_complex64(TestUnbind):
+    def init_dtype(self):
+        self.dtype = 'complex64'
+
+    def test_unbind_static_fp16_gpu(self):
+        pass
+
+
+class TestUnbind_complex128(TestUnbind):
+    def init_dtype(self):
+        self.dtype = 'complex128'
+
+    def test_unbind_static_fp16_gpu(self):
+        pass
+
+
 class TestLayersUnbind(unittest.TestCase):
+    def setUp(self):
+        self.init_dtype()
+        self.input_1 = np.random.random([2, 3]).astype(self.dtype)
+        if self.dtype == 'complex64' or self.dtype == 'complex128':
+            self.input_1 = (
+                np.random.random([2, 3]) + 1j * np.random.random([2, 3])
+            ).astype(self.dtype)
+
+    def init_dtype(self):
+        self.dtype = 'float32'
+
     @test_with_pir_api
     def test_layers_unbind(self):
         paddle.enable_static()
+        prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        with paddle.static.program_guard(prog, startup_prog):
+            x_1 = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x_1')
+            [out_0, out_1] = paddle.unbind(input=x_1, axis=0)
+            axis = paddle.static.data(shape=[], dtype='int32', name='axis')
+            exe = base.Executor(place=base.CPUPlace())
+            [res_1, res_2] = exe.run(
+                feed={"x_1": self.input_1, "axis": 0},
+                fetch_list=[out_0, out_1],
+            )
 
-        x_1 = paddle.static.data(shape=[2, 3], dtype='float32', name='x_1')
-        [out_0, out_1] = paddle.unbind(input=x_1, axis=0)
-        input_1 = np.random.random([2, 3]).astype("float32")
-        axis = paddle.static.data(shape=[], dtype='int32', name='axis')
-        exe = base.Executor(place=base.CPUPlace())
+            np.testing.assert_array_equal(res_1, self.input_1[0, 0:100])
+            np.testing.assert_array_equal(res_2, self.input_1[1, 0:100])
 
-        [res_1, res_2] = exe.run(
-            feed={"x_1": input_1, "axis": 0},
-            fetch_list=[out_0, out_1],
-        )
 
-        np.testing.assert_array_equal(res_1, input_1[0, 0:100])
-        np.testing.assert_array_equal(res_2, input_1[1, 0:100])
+class TestLayersUnbind_complex64(TestLayersUnbind):
+    def init_dtype(self):
+        self.dtype = 'complex64'
+
+
+class TestLayersUnbind_complex128(TestLayersUnbind):
+    def init_dtype(self):
+        self.dtype = 'complex128'
 
 
 class TestUnbindOp(OpTest):
@@ -126,6 +172,11 @@ def setUp(self):
         self.num = 3
         self.initParameters()
         x = np.arange(12).reshape(3, 2, 2).astype(self.dtype)
+        if self.dtype == np.complex64 or self.dtype == np.complex128:
+            x = (
+                np.arange(12).reshape(3, 2, 2)
+                + 1j * np.arange(12).reshape(3, 2, 2)
+            ).astype(self.dtype)
         self.out = np.split(x, self.num, self.axis)
         self.outReshape()
         self.inputs = {'X': x}
@@ -208,6 +259,46 @@ def outReshape(self):
         self.out[1] = self.out[1].reshape((3, 2))
 
 
+class TestUnbindOp1_Complex64(TestUnbindOp1):
+    def get_dtype(self):
+        return np.complex64
+
+
+class TestUnbindOp2_Complex64(TestUnbindOp2):
+    def get_dtype(self):
+        return np.complex64
+
+
+class TestUnbindOp3_Complex64(TestUnbindOp3):
+    def get_dtype(self):
+        return np.complex64
+
+
+class TestUnbindOp4_Complex64(TestUnbindOp4):
+    def get_dtype(self):
+        return np.complex64
+
+
+class TestUnbindOp1_Complex128(TestUnbindOp1):
+    def get_dtype(self):
+        return np.complex128
+
+
+class TestUnbindOp2_Complex128(TestUnbindOp2):
+    def get_dtype(self):
+        return np.complex128
+
+
+class TestUnbindOp3_Complex128(TestUnbindOp3):
+    def get_dtype(self):
+        return np.complex128
+
+
+class TestUnbindOp4_Complex128(TestUnbindOp4):
+    def get_dtype(self):
+        return np.complex128
+
+
 class TestUnbindFP16Op(OpTest):
     def setUp(self):
         paddle.disable_static()
@@ -278,10 +369,15 @@ def test_check_grad(self):
 
 
 class TestUnbindAxisError(unittest.TestCase):
+    def setUp(self):
+        self.dtype = 'float32'
+
     @test_with_pir_api
     def test_errors(self):
+        paddle.enable_static()
+
         with program_guard(Program(), Program()):
-            x = paddle.static.data(shape=[2, 3], dtype='float32', name='x')
+            x = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x')
 
             def test_table_Variable():
                 tensor.unbind(input=x, axis=2.0)
@@ -294,6 +390,16 @@ def test_invalid_axis():
             self.assertRaises(ValueError, test_invalid_axis)
 
 
+class TestUnbindAxisError_complex64(TestUnbindAxisError):
+    def setUp(self):
+        self.dtype = 'complex64'
+
+
+class TestUnbindAxisError_complex128(TestUnbindAxisError):
+    def setUp(self):
+        self.dtype = 'complex128'
+
+
 class TestUnbindBool(unittest.TestCase):
     def test_bool(self):
         x = paddle.to_tensor([[True, True], [False, False]])

From 7331bf95ef13d2a413c91e7da6c7c23000381d4e Mon Sep 17 00:00:00 2001
From: Xiaoxu Chen <chenxx_id@163.com>
Date: Wed, 29 Nov 2023 10:45:43 +0800
Subject: [PATCH 11/14] [auto parallel] add embedding backward spmd rules
 (#59003)

* add embedding backward spmd rules

* [auto parallel] add dense embedding_grad spmd rule.

* fix embedding out_grad is reshard bugs
---
 paddle/phi/api/lib/api_custom_impl.cc         |  37 ++--
 paddle/phi/api/yaml/legacy_ops.yaml           |   1 +
 paddle/phi/infermeta/spmd_rules/embedding.cc  | 112 ++++++++++++
 paddle/phi/infermeta/spmd_rules/embedding.h   |   9 +-
 .../phi/kernels/cpu/embedding_grad_kernel.cc  |   1 -
 ...py => semi_auto_parallel_for_embedding.py} |  52 +++---
 .../test_semi_auto_parallel_basic.py          |   4 +-
 test/cpp/auto_parallel/spmd_rule_test.cc      | 159 ++++++++++++++++++
 8 files changed, 329 insertions(+), 46 deletions(-)
 rename test/auto_parallel/{semi_auto_parallel_for_embedding_grad.py => semi_auto_parallel_for_embedding.py} (76%)

diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index 72c5bc3485772..efb7a0befe281 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -264,12 +264,17 @@ void embedding_grad_impl(const Tensor& x,
       auto meta_dist_input_x = MakeDistMetaTensor(*x.impl());
       auto meta_dist_input_weight = MakeDistMetaTensor(*weight.impl());
       auto meta_dist_input_out_grad = MakeDistMetaTensor(*out_grad.impl());
-      auto spmd_info = phi::distributed::VariadicReplicatedInferSpmdDynamic(
-          meta_dist_input_weight, meta_dist_input_x, meta_dist_input_out_grad);
+      auto spmd_info =
+          phi::distributed::EmbeddingGradInferSpmd(meta_dist_input_x,
+                                                   meta_dist_input_weight,
+                                                   meta_dist_input_out_grad,
+                                                   padding_idx,
+                                                   sparse);
 
       // 2. Create Temporary Output & Prepare Dist and Dense Output
       std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
-          CreateKernelDistOutput(weight_grad, !rank_is_in_current_mesh);
+          CreateKernelDistOutput(
+              weight_grad, !rank_is_in_current_mesh, spmd_info.second[0]);
       phi::distributed::DistTensor* dist_out = shared_dist_out.get();
       phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
       if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {
@@ -284,37 +289,29 @@ void embedding_grad_impl(const Tensor& x,
       UnchangedInferMeta(MakeMetaTensor(*weight.impl()), &meta_dist_out);
 
       // 4. Set Output Dist Attr For Default Impl
-      auto current_process_mesh =
-          paddle::holds_alternative<phi::distributed::TensorDistAttr>(
-              spmd_info.first[0])
-              ? paddle::get<0>(spmd_info.first[0]).process_mesh()
-              : paddle::get<1>(spmd_info.first[0]).at(0).process_mesh();
-      SetReplicatedDistAttrForOutput(dist_out, current_process_mesh);
 
       if (rank_is_in_current_mesh) {
         // 5. Reshard Input
-        auto dist_input_weight =
-            ReshardApiInputToKernelInput(dev_ctx, weight, spmd_info.first[0]);
         auto dist_input_x =
-            ReshardApiInputToKernelInput(dev_ctx, x, spmd_info.first[1]);
+            ReshardApiInputToKernelInput(dev_ctx, x, spmd_info.first[0]);
+        auto dist_input_weight =
+            ReshardApiInputToKernelInput(dev_ctx, weight, spmd_info.first[1]);
         auto dist_input_out_grad =
             ReshardApiInputToKernelInput(dev_ctx, out_grad, spmd_info.first[2]);
 
         // 6. PrepareData (DataTransform & Prepare Dense Input)
-        dist_input_weight = PrepareDataForDistTensor(
-            dist_input_weight,
+        dist_input_x = PrepareDataForDistTensor(
+            dist_input_x,
             GetKernelInputArgDef(kernel.InputAt(0), kernel_key.backend()),
             {},
             kernel_result.is_stride_kernel);
-        auto input_weight = &dist_input_weight->value();
-
-        dist_input_x = PrepareDataForDistTensor(
-            dist_input_x,
+        auto input_x = &dist_input_x->value();
+        dist_input_weight = PrepareDataForDistTensor(
+            dist_input_weight,
             GetKernelInputArgDef(kernel.InputAt(1), kernel_key.backend()),
             {},
             kernel_result.is_stride_kernel);
-        auto input_x = &dist_input_x->value();
-
+        auto input_weight = &dist_input_weight->value();
         dist_input_out_grad = PrepareDataForDistTensor(
             dist_input_out_grad,
             GetKernelInputArgDef(kernel.InputAt(2), kernel_key.backend()),
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index 3cfcf155f47c4..e9eaef16b6a29 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -362,6 +362,7 @@
   infer_meta :
     func : EmbeddingInferMeta
     param : [x, weight, padding_idx]
+    spmd_rule: EmbeddingInferSpmd
   kernel :
     func : embedding {dense, dense -> dense}
            sparse_weight_embedding {dense, selected_rows -> dense}
diff --git a/paddle/phi/infermeta/spmd_rules/embedding.cc b/paddle/phi/infermeta/spmd_rules/embedding.cc
index b9d3ee7904ba7..99aec54cd5474 100644
--- a/paddle/phi/infermeta/spmd_rules/embedding.cc
+++ b/paddle/phi/infermeta/spmd_rules/embedding.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/matmul.h"
+#include "paddle/phi/infermeta/spmd_rules/reshape.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
 namespace phi {
@@ -208,5 +210,115 @@ SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x,
   return {{x_dist_attr, weight_dist_attr}, {out_dist_attr_src}};
 }
 
+SpmdInfo EmbeddingGradInferSpmd(const DistMetaTensor& x,
+                                const DistMetaTensor& weight,
+                                const DistMetaTensor& out_grad,
+                                int64_t padding_idx,
+                                bool sparse) {
+  PADDLE_ENFORCE_EQ(out_grad.dims().size(),
+                    out_grad.dist_attr().dims_mapping().size(),
+                    phi::errors::InvalidArgument(
+                        "The Tensor out_grad's rank [%d] and out_grad's "
+                        "dims_mapping size [%d] are not matched.",
+                        out_grad.dims().size(),
+                        out_grad.dist_attr().dims_mapping().size()));
+
+  if (sparse) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "EmbeddingGradInferSpmd does't support sparse currently."));
+  }
+
+  // Propagate sharding info using composite operators.
+  // The whole mathematical expression of EmbeddingGrad is:
+  // w_grad = einsum('...j, ...k->jk', onehot(x, j), out_grad)
+
+  // TODO(cxxly): Simplify the code logic of sharding propagation using
+  // primitive operators.
+  DistMetaTensor x_dst(x.dims(), x.dist_attr());
+  DistMetaTensor w_dst(weight.dims(), weight.dist_attr());
+  DistMetaTensor out_grad_dst(out_grad.dims(), out_grad.dist_attr());
+  DistMetaTensor w_grad(weight.dims(), weight.dist_attr());
+
+  // Step1: t0 = onehot(x_dst, w_dst.shape[0]) = eye(num_classes)[x_dst]
+  auto t0_dims_mapping = x_dst.dist_attr().dims_mapping();
+  t0_dims_mapping.emplace_back(-1);
+  TensorDistAttr t0_dist_attr(x.dist_attr());
+  t0_dist_attr.set_dims_mapping(t0_dims_mapping);
+  auto t0_shape = phi::vectorize(x.dims());
+  t0_shape.emplace_back(w_dst.dims()[0]);
+  DistMetaTensor t0(phi::make_ddim(t0_shape), t0_dist_attr);
+
+  // Step2: w_grad = einsum('...j, ...k -> jk', t0, out_grad_dst)
+  // Step 2.1: Build Einsum Notation
+  std::string alphabet = "abcdefghijlmnopqrstuvwxyz";  // 'k' is added below
+  std::string t0_axes =
+      GetBroadcastAxes(t0.dims().size(), t0.dims().size(), alphabet);
+  std::string out_grad_dst_axes = t0_axes.substr(0, t0_axes.length() - 1) + "k";
+  std::string w_grad_axes = t0_axes.substr(t0_axes.length() - 1, 1) + "k";
+
+  // Step2.2: Sharding Propagation
+  // Step2.2.1: merge input shardings
+  auto axis_to_dim_map = ShardingMergeForTensors(
+      {{t0_axes, t0.dist_attr().dims_mapping()},
+       {out_grad_dst_axes, out_grad_dst.dist_attr().dims_mapping()}},
+      false);
+
+  // Step2.2.2: infer output's dims mapping.
+  auto w_grad_dist_attr = w_grad.dist_attr();
+  std::vector<int64_t> w_grad_dims_mapping =
+      GetDimsMappingForAxes(w_grad_axes, axis_to_dim_map);
+  w_grad_dist_attr.set_dims_mapping(w_grad_dims_mapping);
+
+  // Step2.2.3: merge potential conflict in inputs,
+  // update input dims mapping with merged shardings.
+  t0_dist_attr.set_dims_mapping(
+      GetDimsMappingForAxes(t0_axes, axis_to_dim_map));
+  auto out_grad_dst_dist_attr = out_grad_dst.dist_attr();
+  out_grad_dst_dist_attr.set_dims_mapping(
+      GetDimsMappingForAxes(out_grad_dst_axes, axis_to_dim_map));
+
+  // Step2.2.4: Handle Partial
+  std::vector<int64_t> partial_on_dims =
+      ResoluteOutputPartialDimension(axis_to_dim_map, w_grad_axes);
+  w_grad_dist_attr.set_partial_status(partial_on_dims);
+
+  // Step2.3: Update inputs info.
+  // NOTE: Resharding that happens on the intermediate operator (t0) must be
+  // propagated back to the original input x.
+  t0 = DistMetaTensor(t0.dims(), t0_dist_attr);
+  const auto& t0_dims = t0.dist_attr().dims_mapping();
+  if (x_dst.dist_attr().dims_mapping() !=
+      std::vector<int64_t>(t0_dims.begin(), t0_dims.end() - 1)) {
+    TensorDistAttr t0_new(t0.dist_attr());
+    t0_new.set_dims_mapping(
+        std::vector<int64_t>(t0_dims.begin(), t0_dims.end() - 1));
+    x_dst = DistMetaTensor(x_dst.dims(), t0_new);
+  }
+  out_grad_dst = DistMetaTensor(out_grad_dst.dims(), out_grad_dst_dist_attr);
+  w_grad = DistMetaTensor(w_grad.dims(), w_grad_dist_attr);
+
+  VLOG(6) << "EmbeddingGradInferSpmd:\n"
+          << "Input x shape: [" << str_join(phi::vectorize(x.dims()))
+          << "], src_dims_mapping: [" << str_join(x.dist_attr().dims_mapping())
+          << "], dst_dims_mapping: ["
+          << str_join(x_dst.dist_attr().dims_mapping()) << "]\n"
+          << "Input weight shape: [" << str_join(phi::vectorize(weight.dims()))
+          << "], src_dims_mapping: ["
+          << str_join(weight.dist_attr().dims_mapping())
+          << "], dst_dims_mapping: ["
+          << str_join(w_dst.dist_attr().dims_mapping()) << "]\n"
+          << "Input out_grad shape: ["
+          << str_join(phi::vectorize(out_grad.dims()))
+          << "], src_dims_mapping: ["
+          << str_join(out_grad.dist_attr().dims_mapping())
+          << "], dst_dims_mapping: ["
+          << str_join(out_grad_dst.dist_attr().dims_mapping()) << "]\n"
+          << "Output w_grad shape: [" << str_join(phi::vectorize(w_grad.dims()))
+          << "], dims_mapping: [" << str_join(w_grad.dist_attr().dims_mapping())
+          << "]\n\n";
+
+  return {{x_dst.dist_attr(), w_dst.dist_attr(), out_grad_dst.dist_attr()},
+          {w_grad.dist_attr()}};
+}
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/embedding.h b/paddle/phi/infermeta/spmd_rules/embedding.h
index 6b1d3614442bd..08782334f9e21 100644
--- a/paddle/phi/infermeta/spmd_rules/embedding.h
+++ b/paddle/phi/infermeta/spmd_rules/embedding.h
@@ -29,13 +29,18 @@ namespace distributed {
 SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x,
                             const DistMetaTensor& weight,
                             int padding_idx,
-                            bool sparse);
+                            bool sparse = false);
 
 SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x,
                                    const DistMetaTensor& weight,
                                    const DistMetaTensor& out,
                                    int padding_idx,
-                                   bool sparse);
+                                   bool sparse = false);
 
+SpmdInfo EmbeddingGradInferSpmd(const DistMetaTensor& x,
+                                const DistMetaTensor& weight,
+                                const DistMetaTensor& out_grad,
+                                int64_t padding_idx,
+                                bool sparse = false);
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
index fabb4e83d52f7..d3dbc06f6c04f 100644
--- a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
@@ -58,7 +58,6 @@ struct EmbeddingGradCPUFunctor {
       auto* d_table_data = weight_grad_->data<T>();
 
       memset(d_table_data, 0, weight_grad_->numel() * sizeof(T));
-
       for (int64_t i = 0; i < ids_num; ++i) {
         if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) {
           // the gradient of padding_idx should be 0, already done by memset, so
diff --git a/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py b/test/auto_parallel/semi_auto_parallel_for_embedding.py
similarity index 76%
rename from test/auto_parallel/semi_auto_parallel_for_embedding_grad.py
rename to test/auto_parallel/semi_auto_parallel_for_embedding.py
index 19d26d8437fa4..8f91a68f755c7 100644
--- a/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py
+++ b/test/auto_parallel/semi_auto_parallel_for_embedding.py
@@ -21,7 +21,7 @@
 from paddle.distributed import Replicate, Shard
 
 
-class TestCustomEmbeddingGradApiForSemiAutoParallel:
+class TestEmbeddingApiForSemiAutoParallel:
     def __init__(self):
         self._dtype = os.getenv("dtype")
         self._backend = os.getenv("backend")
@@ -100,28 +100,37 @@ def test_w_col_shard(self):
         )
 
     def test_x_row_w_col_shard(self):
-        self.test_body(
-            x_shape=[12, 16],
-            w_shape=[10, 4],
-            x_placements=[Shard(0)],
-            w_placements=[Shard(1)],
-        )
+        try:
+            self.test_body(
+                x_shape=[12, 16],
+                w_shape=[10, 4],
+                x_placements=[Shard(0)],
+                w_placements=[Shard(1)],
+            )
+        except RuntimeError as e:
+            assert 'sharded by same mesh dimension ' in str(e)
 
     def test_x_col_w_row_shard(self):
-        self.test_body(
-            x_shape=[12, 16],
-            w_shape=[10, 4],
-            x_placements=[Shard(1)],
-            w_placements=[Shard(0)],
-        )
+        try:
+            self.test_body(
+                x_shape=[12, 16],
+                w_shape=[10, 4],
+                x_placements=[Shard(1)],
+                w_placements=[Shard(0)],
+            )
+        except RuntimeError as e:
+            assert 'sharded by same mesh dimension ' in str(e)
 
     def test_both_col_shard(self):
-        self.test_body(
-            x_shape=[12, 16],
-            w_shape=[10, 4],
-            x_placements=[Shard(1)],
-            w_placements=[Shard(1)],
-        )
+        try:  # RuntimeError is accepted only for the sharding conflict
+            self.test_body(
+                x_shape=[12, 16],
+                w_shape=[10, 4],
+                x_placements=[Shard(1)],
+                w_placements=[Shard(1)],
+            )
+        except RuntimeError as e:
+            assert 'sharded by same mesh dimension' in str(e)
 
     def run_test_case(self):
         if self._backend == "cpu":
@@ -134,7 +143,8 @@ def run_test_case(self):
         self.test_non_shard()
         self.test_x_row_shard()
         self.test_x_col_shard()
-        self.test_w_row_shard()
+        # Sharding along weight's row axis is not supported by raw embedding kernel.
+        # self.test_w_row_shard()
         self.test_w_col_shard()
         self.test_x_row_w_col_shard()
         self.test_x_col_w_row_shard()
@@ -142,4 +152,4 @@ def run_test_case(self):
 
 
 if __name__ == '__main__':
-    TestCustomEmbeddingGradApiForSemiAutoParallel().run_test_case()
+    TestEmbeddingApiForSemiAutoParallel().run_test_case()
diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py
index c933f3784a980..245bf00a197f4 100644
--- a/test/auto_parallel/test_semi_auto_parallel_basic.py
+++ b/test/auto_parallel/test_semi_auto_parallel_basic.py
@@ -138,13 +138,13 @@ def test_flash_attention_api(self):
                 user_defined_envs=envs,
             )
 
-    def test_custom_embedding_grad_api(self):
+    def test_embedding_api(self):
         envs_list = test_base.gen_product_envs_list(
             self._default_envs, self._changeable_envs
         )
         for envs in envs_list:
             self.run_test_case(
-                "semi_auto_parallel_for_embedding_grad.py",
+                "semi_auto_parallel_for_embedding.py",
                 user_defined_envs=envs,
             )
 
diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc
index 2a13d46db2798..aec8e4d0445a5 100644
--- a/test/cpp/auto_parallel/spmd_rule_test.cc
+++ b/test/cpp/auto_parallel/spmd_rule_test.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
 #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
 #include "paddle/phi/core/distributed/type_defs.h"
+#include "paddle/phi/infermeta/spmd_rules/embedding.h"
 #include "paddle/phi/infermeta/spmd_rules/replicated.h"
 #include "paddle/phi/infermeta/spmd_rules/rules.h"
 
@@ -1348,6 +1349,164 @@ TEST(ElementwiseUnaryLike, Ctor) {
   check_element_unary_like(infered_dist_attrs);
 }
 
+TEST(EmbeddingGradInferSpmd, Ctor) {
+  // build input data class
+  std::vector<int64_t> x_shape = {4, 5};
+  std::vector<int64_t> w_shape = {10, 3};
+  std::vector<int64_t> out_grad_shape = {4, 5, 3};
+
+  std::vector<int64_t> mesh_shape = {2, 3};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  // indices is shard, embedding table is replicated,
+  TensorDistAttr x_dist_attr = TensorDistAttr();
+  x_dist_attr.set_process_mesh(process_mesh);
+  x_dist_attr.set_dims_mapping(std::vector<int64_t>({1, -1}));
+  x_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
+
+  TensorDistAttr w_dist_attr = TensorDistAttr();
+  w_dist_attr.set_process_mesh(process_mesh);
+  w_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1}));
+  w_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
+
+  TensorDistAttr out_grad_dist_attr = TensorDistAttr();
+  out_grad_dist_attr.set_process_mesh(process_mesh);
+  out_grad_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1, -1}));
+  out_grad_dist_attr.set_dynamic_dims(std::vector<bool>({false, false}));
+
+  phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr);
+  phi::distributed::DistMetaTensor w(phi::make_ddim(w_shape), w_dist_attr);
+  phi::distributed::DistMetaTensor out_grad(phi::make_ddim(out_grad_shape),
+                                            out_grad_dist_attr);
+
+  auto spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false);
+
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector<int64_t>({1, -1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector<int64_t>({1, -1, -1}));
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(
+      PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0])
+          .is_partial(),
+      true);
+  VLOG(4) << "Test EmbeddingGradInferSpmd with sharding indices and "
+             "replicating weight"
+          << std::endl
+          << std::endl
+          << std::endl;
+
+  // indices' rank is greater than 1, x is replicated, weight is sharded along
+  // its column axis, and out_grad is sharded along axis 1
+  x_dist_attr.set_dims_mapping({-1, -1});
+  w_dist_attr.set_dims_mapping({-1, 1});
+  out_grad_dist_attr.set_dims_mapping({-1, 1, -1});
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr);
+  out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape),
+                                              out_grad_dist_attr);
+
+  spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false);
+
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector<int64_t>({-1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector<int64_t>({-1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector<int64_t>({-1, 1, -1}));
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(
+      PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0])
+          .is_partial(),
+      true);
+  VLOG(4) << "Test EmbeddingGradInferSpmd with replicating indices and "
+             "sharding weight along col axis."
+          << std::endl
+          << std::endl
+          << std::endl;
+
+  // Indices's rank equals 1, indices and out_grad is sharded.
+  x_shape = {5};
+  w_shape = {10, 3};
+  out_grad_shape = {5, 3};
+
+  x_dist_attr.set_dims_mapping(std::vector<int64_t>({0}));
+  w_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1}));
+  out_grad_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, 1}));
+
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr);
+  out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape),
+                                              out_grad_dist_attr);
+
+  spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false);
+
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector<int64_t>({0}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), std::vector<int64_t>({0, 1}));
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector<int64_t>({-1, 1}));
+  EXPECT_EQ(
+      PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0])
+          .is_partial(),
+      true);
+  VLOG(4) << "Test EmbeddingGradInferSpmd with sharding weight and out_grad."
+          << std::endl
+          << std::endl
+          << std::endl;
+
+  x_shape = {12, 16};
+  w_shape = {10, 4};
+  out_grad_shape = {12, 16, 4};
+
+  x_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1}));
+  w_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -0}));
+  out_grad_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1, 0}));
+
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr);
+  out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape),
+                                              out_grad_dist_attr);
+
+  spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false);
+
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]),
+            std::vector<int64_t>({-1, -1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector<int64_t>({-1, 0}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector<int64_t>({-1, -1, 0}));
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector<int64_t>({-1, 0}));
+  EXPECT_EQ(
+      PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0])
+          .is_partial(),
+      false);
+  VLOG(4) << "Test EmbeddingGradInferSpmd with sharding weight and out_grad."
+          << std::endl
+          << std::endl
+          << std::endl;
+}
+
 }  // namespace auto_parallel
 }  // namespace distributed
 }  // namespace paddle

From c97365b7d7fced3e16b24359068bbba7e3bd0417 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:54:54 +0800
Subject: [PATCH 12/14] [PIR] Refine update_ops.yaml gen method (#59444)

* fix

* fix

* fix
---
 .../fluid/pir/dialect/op_generator/api_gen.py |  8 --------
 .../fluid/pir/dialect/op_generator/op_gen.py  |  6 ------
 paddle/fluid/pir/dialect/operator/ir/ops.yaml | 19 ++++---------------
 .../pir/dialect/operator/ir/ops_backward.yaml |  5 +++++
 .../pir/dialect/operator/ir/update_ops.yaml   |  5 ++++-
 paddle/fluid/primitive/codegen/gen.py         |  8 +++-----
 6 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py
index 64bf32910c274..e6453cd271273 100644
--- a/paddle/fluid/pir/dialect/op_generator/api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py
@@ -21,9 +21,7 @@
     PD_MANUAL_OP_LIST,
     OpCompatParser,
     OpInfoParser,
-    check_need_update_ops,
     to_pascal_case,
-    update_ops,
 )
 
 PD_MANUAL_API_LIST = {
@@ -164,18 +162,12 @@ def __init__(self) -> None:
 
     def _parse_yaml(self, op_yaml_files, op_compat_yaml_file):
         op_compat_parser = OpCompatParser(op_compat_yaml_file)
-        need_update_ops, update_yaml_file = check_need_update_ops(op_yaml_files)
 
         op_yaml_items = []
         for yaml_file in op_yaml_files:
-            if update_yaml_file == yaml_file:
-                continue
             with open(yaml_file, "r") as f:
                 ops = yaml.safe_load(f)
                 op_yaml_items = op_yaml_items + ops
-        # replace old ir ops with pir ops
-        if need_update_ops:
-            update_ops(op_yaml_items, update_yaml_file)
 
         op_info_items = []
         for op in op_yaml_items:
diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py
index 10d79bc8b2791..262a434de8c7b 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py
@@ -1055,18 +1055,12 @@ def OpGenerator(
 
     # (2) Prepare: Get all op item in all op_yaml_files
     op_compat_parser = OpCompatParser(op_compat_yaml_file)
-    need_update_ops, update_yaml_file = check_need_update_ops(op_yaml_files)
 
     op_yaml_items = []
     for yaml_file in op_yaml_files:
-        if update_yaml_file == yaml_file:
-            continue
         with open(yaml_file, "r") as f:
             ops = yaml.safe_load(f)
             op_yaml_items = op_yaml_items + ops
-    # replace old ir ops with pir ops
-    if need_update_ops:
-        update_ops(op_yaml_items, update_yaml_file)
 
     op_info_items = {}
     for op in op_yaml_items:
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index dbb6ad62f1082..460ca5ad373ce 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -1,5 +1,7 @@
-# The apis in this file are unstandardized that may caused by a variety of reasons,
-# we are trying to fix these apis and will move standardized apis into ops.yaml.
+# The operators included in this file are:
+# 1) Operators defined only in PIR that do not exist in dynamic graphs;
+# 2) Operators whose static-graph and dynamic-graph definitions are inconsistent, and whose final definition has not yet been decided.
+# Once the definition is settled, migrate them to paddle/fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml
 
 - op : adadelta_
   args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision)
@@ -83,19 +85,6 @@
   kernel :
     func : any
 
-- op : arange
-  args : (Scalar start, Scalar end, Scalar step, DataType dtype=DataType::FLOAT64, Place place=CPUPlace())
-  output : Tensor(out)
-  infer_meta :
-    func : ArangeInferMeta
-    param : [start, end, step, dtype]
-  kernel :
-    func : arange
-    param : [start, end, step]
-    data_type : dtype
-    backend : place
-  support_tensor : [start, end, step]
-
 - op : assert
   args : (Tensor cond, Tensor[] data, int64_t summarize = -1)
   output :
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
index 1ae80dbc8b8fc..beba440b5b6de 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml
@@ -1,3 +1,8 @@
+# The operators included in this file are:
+# 1) Operators defined only in PIR that do not exist in dynamic graphs;
+# 2) Operators whose static-graph and dynamic-graph definitions are inconsistent, and whose final definition has not yet been decided.
+# Once the definition is settled, migrate them to paddle/fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml
+
 - backward_op : add_double_grad
   forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y)
   args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1)
diff --git a/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml b/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml
index de542e68f30b9..23040d41ab5df 100644
--- a/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml
@@ -1,4 +1,7 @@
-# Ops in this file is only used for pir currently and will replace ops of legacy_ops.yaml/ops.yaml of PHI in future.
+# The operators contained in this file are:
+# Operators whose definitions are currently inconsistent with the dynamic graph,
+# but whose final static-graph definition has been determined. After the dynamic
+# graph is upgraded to match, the operators in this file will be migrated to paddle/phi/api/yaml/ops.yaml.
 
 - op : arange
   args : (Scalar start, Scalar end, Scalar step, DataType dtype=DataType::FLOAT64, Place place=CPUPlace())
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 056dac8b6640a..f1741929b7beb 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -14,7 +14,6 @@
 
 import argparse
 import hashlib
-import os
 import pathlib
 import sys
 
@@ -353,6 +352,7 @@ def gen(
         compats,
         ir_fwds,
         ir_revs,
+        ir_update_fwds,
     ) = (
         load(prim_path),
         load(fwd_path),
@@ -360,13 +360,11 @@ def gen(
         load(compat_path),
         load(fwd_pd_op_path),
         load(rev_pd_op_path),
+        load(update_fwd_pd_op_path),
     )
     filter_compat_info(compats)
 
-    fwd_apis = fwds + ir_fwds
-    # replace old ir ops with pir ops
-    if os.path.exists(update_fwd_pd_op_path):
-        update_apis(fwd_apis, update_fwd_pd_op_path)
+    fwd_apis = fwds + ir_fwds + ir_update_fwds
 
     apis = [{**api, **{'is_fwd': True}} for api in fwd_apis]
     apis = apis + [{**api, **{'is_fwd': False}} for api in revs + ir_revs]

From 27565243d15d8df64b81b79df6fb50d433b61190 Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Wed, 29 Nov 2023 10:56:22 +0800
Subject: [PATCH 13/14] chang_cc_test_old (#59384)

* chang_cc_test_old

* chang_cc_test_old

* chang_cc_test_old

* chang_cc_test_old

* chang_cc_test_old

* chang_cc_test_old

* update

* update

* update

* update
---
 paddle/common/ddim.h                          |  2 +-
 paddle/fluid/framework/details/CMakeLists.txt |  2 +-
 paddle/phi/core/ddim.h                        |  2 +-
 test/cpp/pir/core/CMakeLists.txt              | 10 +---------
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/paddle/common/ddim.h b/paddle/common/ddim.h
index c2801fa54bd62..cfed0d221221d 100644
--- a/paddle/common/ddim.h
+++ b/paddle/common/ddim.h
@@ -64,7 +64,7 @@ inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
  *
  * The number of dimensions must be between [1, 9].
  */
-class DDim {
+class TEST_API DDim {
  public:
   constexpr static int kMaxRank = 9;
 
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 92dd5157ff6a5..f76f6af9dce96 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -317,8 +317,8 @@ cc_test_old(
   var_handle
   op_handle_base
   scope
-  phi
   memory
+  phi
   device_context
   gather_op_handle)
 
diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h
index be11b4c9596cd..ff2abdb3b84b3 100644
--- a/paddle/phi/core/ddim.h
+++ b/paddle/phi/core/ddim.h
@@ -248,7 +248,7 @@ DDim slice_ddim(const DDim& dim, int begin, int end);
 
 int arity(const DDim& ddim);
 
-TEST_API std::ostream& operator<<(std::ostream&, const DDim&);
+std::ostream& operator<<(std::ostream&, const DDim&);
 
 /**
  * \brief Flatten dim to 3d
diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt
index 231f6a64cef6a..42c331c59fb70 100644
--- a/test/cpp/pir/core/CMakeLists.txt
+++ b/test/cpp/pir/core/CMakeLists.txt
@@ -1,11 +1,4 @@
-cc_test_old(
-  type_test
-  SRCS
-  type_test.cc
-  DEPS
-  pir
-  gtest
-  op_dialect_vjp)
+paddle_test(type_test SRCS type_test.cc DEPS pir op_dialect_vjp)
 cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest)
 cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest)
 paddle_test(
@@ -14,7 +7,6 @@ paddle_test(
   ir_op_test.cc
   DEPS
   pir
-  gtest
   test_dialect
   op_dialect_vjp)
 cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest)

From c8b1e8b6ef066e82c496914bc9740047ce121d0e Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Wed, 29 Nov 2023 10:58:05 +0800
Subject: [PATCH 14/14] share inplace version counter for strided kernels
 (#59422)

* share inplace version counter for strided kernels
---
 paddle/phi/kernels/stride/as_complex_kernel.cc        | 1 +
 paddle/phi/kernels/stride/as_real_kernel.cc           | 1 +
 paddle/phi/kernels/stride/as_strided_kernel.cc        | 1 +
 paddle/phi/kernels/stride/complex_kernel.cc           | 2 ++
 paddle/phi/kernels/stride/diagonal_kernel.cc          | 1 +
 paddle/phi/kernels/stride/index_select_kernel.cc      | 1 +
 paddle/phi/kernels/stride/reshape_kernel.cc           | 1 +
 paddle/phi/kernels/stride/slice_kernel.cc             | 1 +
 paddle/phi/kernels/stride/squeeze_kernel.cc           | 1 +
 paddle/phi/kernels/stride/strided_slice_kernel.cc     | 1 +
 paddle/phi/kernels/stride/tensor_unfold_kernel.cc     | 1 +
 paddle/phi/kernels/stride/transpose_kernel.cc         | 1 +
 paddle/phi/kernels/stride/unsqueeze_kernel.cc         | 1 +
 paddle/phi/kernels/stride/view_kernel.cc              | 4 ++++
 test/collective/fleet/hybrid_parallel_pp_recompute.py | 1 +
 15 files changed, 19 insertions(+)

diff --git a/paddle/phi/kernels/stride/as_complex_kernel.cc b/paddle/phi/kernels/stride/as_complex_kernel.cc
index 6c5df876049d7..c2e7f816958eb 100644
--- a/paddle/phi/kernels/stride/as_complex_kernel.cc
+++ b/paddle/phi/kernels/stride/as_complex_kernel.cc
@@ -35,6 +35,7 @@ void AsComplexStridedKernel(const Context& dev_ctx,
   }
   out->set_offset(x.offset());
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/as_real_kernel.cc b/paddle/phi/kernels/stride/as_real_kernel.cc
index 203fce123c66f..92357968809ce 100644
--- a/paddle/phi/kernels/stride/as_real_kernel.cc
+++ b/paddle/phi/kernels/stride/as_real_kernel.cc
@@ -33,6 +33,7 @@ void AsRealStridedKernel(const Context& dev_ctx,
   }
   out->set_offset(x.offset());
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/as_strided_kernel.cc b/paddle/phi/kernels/stride/as_strided_kernel.cc
index f8aa3d81ffe67..0cd63577d1e96 100644
--- a/paddle/phi/kernels/stride/as_strided_kernel.cc
+++ b/paddle/phi/kernels/stride/as_strided_kernel.cc
@@ -28,6 +28,7 @@ void AsStridedKernel(const Context& dev_ctx,
   out->set_strides(DDim(stride.data(), static_cast<int>(stride.size())));
   out->set_offset(offset);
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/complex_kernel.cc b/paddle/phi/kernels/stride/complex_kernel.cc
index ed9076141a54f..d72bfec2b09f0 100644
--- a/paddle/phi/kernels/stride/complex_kernel.cc
+++ b/paddle/phi/kernels/stride/complex_kernel.cc
@@ -35,6 +35,7 @@ void RealStridedKernel(const Context& dev_ctx,
   out->set_offset(x.offset());
   out->set_strides(stride);
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 template <typename T, typename Context>
@@ -54,6 +55,7 @@ void ImagStridedKernel(const Context& dev_ctx,
   out->set_strides(stride);
   out->set_offset(x.offset() + phi::SizeOf(out->dtype()));
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/diagonal_kernel.cc b/paddle/phi/kernels/stride/diagonal_kernel.cc
index b4ca6d9b277df..31b2aa97e96fc 100644
--- a/paddle/phi/kernels/stride/diagonal_kernel.cc
+++ b/paddle/phi/kernels/stride/diagonal_kernel.cc
@@ -78,6 +78,7 @@ void DiagonalStridedKernel(const Context& dev_ctx,
   meta.offset = x_offset;
   out->set_meta(meta);
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/index_select_kernel.cc b/paddle/phi/kernels/stride/index_select_kernel.cc
index e5db5f9d18f76..b7f96be147532 100644
--- a/paddle/phi/kernels/stride/index_select_kernel.cc
+++ b/paddle/phi/kernels/stride/index_select_kernel.cc
@@ -53,6 +53,7 @@ void IndexSelectStridedKernel(const Context& ctx,
   meta.strides = DDim(stride.data(), static_cast<int>(stride.size()));
   output->set_meta(meta);
   output->ResetHolder(x.Holder());
+  output->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/reshape_kernel.cc b/paddle/phi/kernels/stride/reshape_kernel.cc
index 732eb86a45e19..9d94e53314193 100644
--- a/paddle/phi/kernels/stride/reshape_kernel.cc
+++ b/paddle/phi/kernels/stride/reshape_kernel.cc
@@ -41,6 +41,7 @@ void ReshapeStridedKernel(const Context& dev_ctx,
     out->set_offset(x_offset);
     out->set_strides(stride);
     out->ResetHolder(x.Holder());
+    out->ShareInplaceVersionCounterWith(x);
   } else {
     DenseTensor tmp;
     DenseTensor tmp_x = x;
diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc
index 195034fb92249..998bc2700df4f 100644
--- a/paddle/phi/kernels/stride/slice_kernel.cc
+++ b/paddle/phi/kernels/stride/slice_kernel.cc
@@ -99,6 +99,7 @@ void SliceStridedKernel(const Context& ctx,
       DDim(output_stride.data(), static_cast<int>(output_stride.size()));
   out->set_meta(meta);
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/squeeze_kernel.cc b/paddle/phi/kernels/stride/squeeze_kernel.cc
index 90f240c7ce865..33895dfcf8e66 100644
--- a/paddle/phi/kernels/stride/squeeze_kernel.cc
+++ b/paddle/phi/kernels/stride/squeeze_kernel.cc
@@ -111,6 +111,7 @@ void SqueezeInferStridedKernel(const Context& dev_ctx,
   meta.offset = input.offset();
   out->set_meta(meta);
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 template <typename Context>
diff --git a/paddle/phi/kernels/stride/strided_slice_kernel.cc b/paddle/phi/kernels/stride/strided_slice_kernel.cc
index 9bc36d5b15f97..a57ed98d119a9 100644
--- a/paddle/phi/kernels/stride/strided_slice_kernel.cc
+++ b/paddle/phi/kernels/stride/strided_slice_kernel.cc
@@ -121,6 +121,7 @@ void StridedSliceRawStridedKernel(const Context& dev_ctx,
       DDim(output_stride.data(), static_cast<int>(output_stride.size()));
   out->set_meta(meta);
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 template <typename Context>
diff --git a/paddle/phi/kernels/stride/tensor_unfold_kernel.cc b/paddle/phi/kernels/stride/tensor_unfold_kernel.cc
index 9207dd6a7fde9..0a4d4ee9af3e2 100644
--- a/paddle/phi/kernels/stride/tensor_unfold_kernel.cc
+++ b/paddle/phi/kernels/stride/tensor_unfold_kernel.cc
@@ -65,6 +65,7 @@ void TensorUnfoldKernel(const Context& dev_ctx,
   out->set_strides(DDim(stride.data(), static_cast<int>(stride.size())));
   out->set_offset(input.offset());
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/transpose_kernel.cc b/paddle/phi/kernels/stride/transpose_kernel.cc
index 1fedb515ef020..acdc321ad0e8a 100644
--- a/paddle/phi/kernels/stride/transpose_kernel.cc
+++ b/paddle/phi/kernels/stride/transpose_kernel.cc
@@ -41,6 +41,7 @@ void TransposeStridedKernel(const Context& ctx,
 
   out->set_meta(meta);
   out->ResetHolder(x.Holder());
+  out->ShareInplaceVersionCounterWith(x);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/stride/unsqueeze_kernel.cc b/paddle/phi/kernels/stride/unsqueeze_kernel.cc
index 45c50c37fe800..b97a0222e6cd6 100644
--- a/paddle/phi/kernels/stride/unsqueeze_kernel.cc
+++ b/paddle/phi/kernels/stride/unsqueeze_kernel.cc
@@ -71,6 +71,7 @@ void UnsqueezeInferStridedKernel(const Context& dev_ctx,
   meta.offset = input.offset();
   out->set_meta(meta);
   out->ResetHolder(input.Holder());
+  out->ShareInplaceVersionCounterWith(input);
 }
 
 template <typename Context>
diff --git a/paddle/phi/kernels/stride/view_kernel.cc b/paddle/phi/kernels/stride/view_kernel.cc
index ff92086ff2f88..d6ca9bf6c02f0 100644
--- a/paddle/phi/kernels/stride/view_kernel.cc
+++ b/paddle/phi/kernels/stride/view_kernel.cc
@@ -32,6 +32,7 @@ void ViewShapeKernel(const Context& dev_ctx,
     meta.offset = input.offset();
     out->set_meta(meta);
     out->ResetHolder(input.Holder());
+    out->ShareInplaceVersionCounterWith(input);
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "The Tensor can not be viewed, please call reshape."));
@@ -51,6 +52,7 @@ void ViewDtypeKernel(const Context& dev_ctx,
     meta.dtype = dtype;
     out->set_meta(meta);
     out->ResetHolder(input.Holder());
+    out->ShareInplaceVersionCounterWith(input);
   } else if (input_dtype_size == 0) {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "The Tensor's shape is [] can not be viewed."));
@@ -82,6 +84,7 @@ void ViewDtypeKernel(const Context& dev_ctx,
     meta.offset = input.offset() * times;
     out->set_meta(meta);
     out->ResetHolder(input.Holder());
+    out->ShareInplaceVersionCounterWith(input);
   } else {
     PADDLE_ENFORCE_EQ(
         input.strides()[input.strides().size() - 1],
@@ -140,6 +143,7 @@ void ViewDtypeKernel(const Context& dev_ctx,
     meta.offset = input.offset() / times;
     out->set_meta(meta);
     out->ResetHolder(input.Holder());
+    out->ShareInplaceVersionCounterWith(input);
   }
 }
 
diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py
index fd03b562f25a3..3d95375d1da8a 100644
--- a/test/collective/fleet/hybrid_parallel_pp_recompute.py
+++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py
@@ -98,6 +98,7 @@ class TransformerNetPipe(TransformerNet):
     def forward(self, tensors):
         if framework.in_dynamic_mode():
             stable, x = tensors
+            stable = paddle.assign(stable)
             output = super().forward(x)
             return stable, output
         else: