From 064c16074b11e815f371b9cb0d28ea7dff1336e1 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 29 Nov 2023 09:35:45 +0800 Subject: [PATCH 01/14] [PIR+CINN]Support SubGraph Exporter for Unittest Platform (#59353) * [PIR+CINN]Support SubGraph Exporter for Unittest Platform * add unittest * fix UT not take effect --- paddle/phi/core/flags.cc | 12 + .../paddle/jit/dy2static/export_subgraph.py | 245 ++++++++++++++++++ .../paddle/jit/dy2static/partial_program.py | 33 ++- .../jit/dy2static/pir_partial_program.py | 1 + .../jit/dy2static/program_translator.py | 14 +- test/ir/pir/CMakeLists.txt | 4 + test/ir/pir/test_subgraph_exporter.py | 102 ++++++++ 7 files changed, 406 insertions(+), 5 deletions(-) create mode 100644 python/paddle/jit/dy2static/export_subgraph.py create mode 100644 test/ir/pir/test_subgraph_exporter.py diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index ebef7410c31bf..eea908c0db520 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1337,6 +1337,18 @@ PHI_DEFINE_EXPORTED_string( ir_inplace_kernel_blacklist, "", "It controls the ir inplace kernel subset do not use."); +/** + * Specify the directory of saving PIR sugraph from @to_static + * Name: pir_subgraph_saving_dir + * Since Version: 2.6.0 + * Value Range: str, default="" + * Example: + * Note: "/workspace/my_path", it will save into my_path dir; + */ +PHI_DEFINE_EXPORTED_string( + pir_subgraph_saving_dir, + "", + "Specify the directory of saving PIR sugraph from @to_static."); PHI_DEFINE_EXPORTED_bool(enable_record_memory, false, "Enable memory recorder"); diff --git a/python/paddle/jit/dy2static/export_subgraph.py b/python/paddle/jit/dy2static/export_subgraph.py new file mode 100644 index 0000000000000..900f29db1c2a4 --- /dev/null +++ b/python/paddle/jit/dy2static/export_subgraph.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle import pir +from paddle.base import core +from paddle.base.dygraph.base import switch_to_static_graph +from paddle.base.framework import Variable, get_flags + +__all__ = [] + +MAX_FILE_PATH_LEN = 50 + + +class SubGraphRole: + Infer = 0 + Forward = 1 + Backward = 2 + + +def get_saving_dir(): + flag = "FLAGS_pir_subgraph_saving_dir" + value = get_flags(flag)[flag] + return value + + +class BaseExporter: + def __init__(self, partial_program_layer, program, role): + self.pp_layer = partial_program_layer + self.program = program + self.role = role + self.root_dir = get_saving_dir() + + def save(self): + # step 1: Create subgraph saving path. + saving_path = self.generate_saving_path() + + # step 2: Translate into pir program. + pir_program = self.translate_into_pir() + + # step 3: save into local disk. + self._save(pir_program, saving_path) + + def _save(self, pir_program, path): + content = str(pir_program) + with open(path, 'w') as f: + f.write(content) + + def parse_inout(self): + raise NotImplementedError("Need to implement parse_inout method") + + def translate_into_pir(self): + # step 1: Insert data op for inputs/params + feed_list, fetch_list, inter_outs = self.parse_inout() + self.insert_feed_op(feed_list, "pt_input_") + # step 2: Insert fetch op for outputs + self.insert_fetch_op(fetch_list, "pt_output_") + self.insert_fetch_op(inter_outs, "pt_intermediate_") + # step 3: translate into pir + pir_program = pir.translate_to_pir(self.program.desc) + return pir_program + + def generate_saving_path(self): + layer_name = self.pp_layer._debug_name + assert layer_name is not None + ops_name = [ + op.type for op in self.program.block(0).ops[:MAX_FILE_PATH_LEN] + ] + prefix = ["infer_", "forward_", "backward_"][self.role] + file_name = prefix + "_".join(ops_name)[:MAX_FILE_PATH_LEN] + '.txt' + saving_dir = os.path.join(self.root_dir, layer_name) + self.verify_saving_dir(saving_dir) + return os.path.join(self.root_dir, layer_name, file_name) + + def verify_saving_dir(self, dir_path): + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + def insert_feed_op(self, intputs, rename_prefix): + global_block = self.program.block(0) + + for i, var in enumerate(intputs): + old_name = var.name + new_name = rename_prefix + str(i) + global_block._rename_var(old_name, new_name) + out = global_block.var(new_name) + global_block._prepend_op( + type='data', + inputs={}, + outputs={'out': out}, + attrs={ + 'shape': out.shape, + 'dtype': out.dtype, + 'place': 0, + 'name': out.name, + }, + ) + global_block._sync_with_cpp() + + def insert_fetch_op(self, outputs, rename_prefix): + global_block = self.program.block(0) + fetch_var = global_block.create_var( + name="fetch_outputs", + type=core.VarDesc.VarType.FETCH_LIST, + persistable=False, + ) + for i, out in enumerate(outputs): + var = self.get_var(out) + old_name = var.name + new_name = rename_prefix + str(i) + global_block._rename_var(old_name, new_name) + new_var = global_block.var(new_name) + global_block.append_op( + type="fetch", + inputs={'X': [new_var]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}, + ) + global_block._sync_with_cpp() + + def rename_ops(self, ops, new_name, old_name): + for op in ops: + op._rename_input(old_name, new_name) + op._rename_output(old_name, new_name) + + def get_var(self, name_or_var): + if isinstance(name_or_var, Variable): + return name_or_var + assert isinstance(name_or_var, str) + global_block = self.program.block(0) + return global_block.var(name_or_var) + + +class InferExporter(BaseExporter): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def parse_inout(self): + inputs, outputs = [], [] + global_block = self.program.block(0) + raw_inputs = self.pp_layer._inputs.tolist() + self.pp_layer._params + raw_outputs = self.pp_layer._outputs.tolist() + for var in raw_inputs: + new_var = global_block.var(var.name) + inputs.append(new_var) + + for var in raw_outputs: + new_var = global_block.var(var.name) + outputs.append(new_var) + + return inputs, outputs, [] + + +class TrainFwdExporter(BaseExporter): + def __init__(self, pp_layer, copy_program, role, raw_inter_outs): + super().__init__(pp_layer, copy_program, role) + self.raw_inter_outs = raw_inter_outs + + def parse_inout(self): + inputs, outputs = [], [] + global_block = self.program.block(0) + raw_inputs = self.pp_layer._inputs.tolist() + self.pp_layer._params + raw_outputs = self.pp_layer._outputs.tolist() + + inter_outs = { + name + for name in self.raw_inter_outs + if self.program.block(0).has_var(name) + } + for var in raw_inputs: + new_var = global_block.var(var.name) + inputs.append(new_var) + if var.name in inter_outs: + inter_outs.remove(var.name) + + for var in raw_outputs: + new_var = global_block.var(var.name) + outputs.append(new_var) + if var.name in inter_outs: + inter_outs.remove(var.name) + + return inputs, outputs, list(inter_outs) + + +class TrainBwdExporter(BaseExporter): + def __init__(self, pp_layer, copy_program, role, raw_inputs, raw_outputs): + super().__init__(pp_layer, copy_program, role) + self.raw_inputs = raw_inputs + self.raw_outputs = raw_outputs + + def parse_inout(self): + inputs, outputs = [], [] + global_block = self.program.block(0) + + for var_name in self.raw_inputs: + if global_block.has_var(var_name): + inputs.append(global_block.var(var_name)) + + # add fill_constant grad_var as input + for var in self.pp_layer._outputs.tolist(): + init_grad_name = var.name + "@GRAD" + if init_grad_name not in self.raw_inputs and global_block.has_var( + init_grad_name + ): + inputs.append(global_block.var(init_grad_name)) + + for var_name in self.raw_outputs: + if ( + global_block.has_var(var_name) + and var_name not in self.raw_inputs + ): + outputs.append(global_block.var(var_name)) + + return inputs, outputs, [] + + +@switch_to_static_graph +def pir_exporter(pp_layer, program, role, shared_inputs=None, inter_outs=None): + # skip it if not specify root_saving_dir by FLAGS. + root_saving_dir = get_saving_dir() + if not root_saving_dir: + return + copy_program = program.clone() + if role == SubGraphRole.Infer: + InferExporter(pp_layer, copy_program, role).save() + elif role == SubGraphRole.Forward: + TrainFwdExporter(pp_layer, copy_program, role, inter_outs).save() + elif role == SubGraphRole.Backward: + TrainBwdExporter( + pp_layer, copy_program, role, shared_inputs, inter_outs + ).save() + else: + raise RuntimeError("role only support Infer/Forward/Backward") diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 0aa0d0e3dcb07..1c4e2eb3b49f0 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -28,6 +28,7 @@ from paddle.optimizer.lr import LRScheduler from . import logging_utils +from .export_subgraph import SubGraphRole, pir_exporter from .utils import ( RETURN_NO_VALUE_MAGIC_NUM, backend_guard, @@ -226,6 +227,7 @@ def __init__( self._out_var_descs = [ self._outputs[var_id].desc for var_id in self._outputs.var_ids ] + self._debug_name = None def __call__(self, inputs): """ @@ -544,14 +546,19 @@ def train_program(self): @property def infer_program(self): if _in_amp_guard(): - return self._infer_amp_program + infer_program = self._infer_amp_program elif _in_pure_fp16_guard(): - return self._infer_pure_fp16_program + infer_program = self._infer_pure_fp16_program else: - return self._infer_program + infer_program = self._infer_program + # NOTE(Aurelius84): Export forward_program for SubGraphChecker, + # see export_subgraph for detail. + pir_exporter(self, infer_program, SubGraphRole.Infer) + return infer_program @property def forward_program(self): + forward_program, role = None, None if self.training: if _in_amp_guard(): progs = self._train_amp_forward_backward_program @@ -561,7 +568,8 @@ def forward_program(self): progs = self._train_forward_backward_program return progs[0] else: - return self.infer_program + forward_program = self.infer_program + return forward_program @property def backward_program(self): @@ -868,6 +876,23 @@ def _get_forward_backward_program_form( self._apply_inplace_pass( forward_builded_program, backward_builded_program ) + + # NOTE(Aurelius84): Export forward/backward program for SubGraphChecker, + # see export_subgraph for detail. + pir_exporter( + self, + forward_builded_program, + SubGraphRole.Forward, + set(), + set(forward_skip_vars), + ) + pir_exporter( + self, + backward_builded_program, + SubGraphRole.Backward, + set(forward_skip_vars), + set(backward_skip_vars), + ) return [forward_builded_program, backward_builded_program] def _apply_inplace_pass(self, forward_program, backward_program): diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 109c96ca11bba..d8009a3bcd399 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -445,6 +445,7 @@ def __init__( self._hooker = None self._backend = kwargs.get('backend', None) self._grad_var_names = {} + self._debug_name = None def __call__(self, inputs): """ diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index a8b87750b108a..26691bf06b0f0 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -376,6 +376,16 @@ def __init__(self, function, input_spec=None, **kwargs): self._cuda_graph_capture_mode = "" self._cuda_graph_pool_id = 0 self._property = kwargs.get("property", False) + self._get_debug_name() + + def _get_debug_name(self): + try: + if self._class_instance: + self._debug_name = self._class_instance.__class__.__name__ + else: + self._debug_name = self._dygraph_function.__name__ + except Exception: + self._debug_name = "static_function" @property def is_property(self): @@ -778,7 +788,7 @@ def _perform_call(self, *args, **kwargs): args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) try: - concrete_program, partial_program_layer = self.get_concrete_program( + _, partial_program_layer = self.get_concrete_program( *args, **kwargs, is_train=self._is_train_mode() ) # 2. synchronize self.training attribute. @@ -863,6 +873,7 @@ def get_concrete_program(self, *args, **kwargs): concrete_program, partial_program_layer = self._program_cache[ cache_key ] + partial_program_layer._debug_name = self._debug_name return concrete_program, partial_program_layer def get_concrete_program_with_cache_key(self, cached_key): @@ -1442,6 +1453,7 @@ class FallbackProgramLayer: 'training', '_cuda_graph_capture_mode', '_cuda_graph_pool_id', + '_debug_name', ] def __init__(self, instance, dy_func): diff --git a/test/ir/pir/CMakeLists.txt b/test/ir/pir/CMakeLists.txt index 5a9f2c48509b3..dd2d0ced90104 100644 --- a/test/ir/pir/CMakeLists.txt +++ b/test/ir/pir/CMakeLists.txt @@ -14,6 +14,10 @@ set(TEST_IR_SYSTEM_CASES test_stop_gradient test_override_operator) list(REMOVE_ITEM TEST_INTERP_CASES ${TEST_IR_SYSTEM_CASES}) +list(REMOVE_ITEM TEST_INTERP_CASES test_subgraph_exporter) +py_test_modules( + test_subgraph_exporter MODULES test_subgraph_exporter ENVS MIN_GRAPH_SIZE=0 + FLAGS_pir_subgraph_saving_dir=${CMAKE_CURRENT_SOURCE_DIR}) foreach(target ${TEST_INTERP_CASES}) py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1 diff --git a/test/ir/pir/test_subgraph_exporter.py b/test/ir/pir/test_subgraph_exporter.py new file mode 100644 index 0000000000000..9eed01700dbae --- /dev/null +++ b/test/ir/pir/test_subgraph_exporter.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import unittest + +import paddle +from paddle.jit.dy2static.export_subgraph import get_saving_dir + + +class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + y = paddle.pow(x, 2) + z = x + y + z2 = paddle.matmul(y, z) + out = paddle.nn.functional.relu(z * z2) + out = paddle.mean(out) + return out, z2 + + +class TestSaveFwdBwdProg(unittest.TestCase): + def setUp(self): + self.net = paddle.jit.to_static(Net()) + self.root_dir = os.path.join(get_saving_dir(), "wrapper") + self.clean() + + def clean(self): + if os.path.exists(self.root_dir): + shutil.rmtree(self.root_dir) + os.mkdir(self.root_dir) + + def test_export(self): + x = paddle.randn([4, 4]) + x.stop_gradient = False + out = self.net(x) + self.check_export() + + def check_export(self): + for prog_file in os.listdir(self.root_dir): + if "forward" in prog_file: + self.check_fwd(prog_file) + return + elif "backward" in prog_file: + self.check_bwd(prog_file) + else: + raise RuntimeError("Not Support.") + + def check_fwd(self, prog_file): + prog_info = [ + "pt_input_0", + "pt_output_0", + "pt_output_1", + "pt_intermediate_0", + "pt_intermediate_1", + "pt_intermediate_2", + ] + path = os.path.join(self.root_dir, prog_file) + with open(path, 'r') as f: + content = f.readlines() + index = 0 + for op_str in content: + if "pd_op.data" in op_str or "pd_op.fetch" in op_str: + self.assertIn(prog_info[index], op_str) + index += 1 + + def check_bwd(self, prog_file): + prog_info = [ + "pt_input_6", + "pt_input_5", + "pt_input_4", + "pt_input_3", + "pt_input_2", + "pt_input_1", + "pt_input_0", + ] + path = os.path.join(self.root_dir, prog_file) + with open(path, 'r') as f: + content = f.readlines() + index = 0 + for op_str in content: + if "pd_op.data" in op_str or "pd_op.fetch" in op_str: + self.assertIn(prog_info[index], op_str) + index += 1 + + +if __name__ == "__main__": + unittest.main() From b22b8798ec6fc0263294bd9b1f12831b2a508619 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 29 Nov 2023 09:51:49 +0800 Subject: [PATCH 02/14] [XPU] prepare ut for bf16 (#59389) * [XPU] prepare ut for bf16 * add ut for coverage * add ut for coverage --- paddle/phi/backends/xpu/xpu2_op_list.cc | 5 ++- paddle/phi/backends/xpu/xpu3_op_list.cc | 5 ++- paddle/phi/kernels/funcs/tensor_formatter.cc | 8 ++++ test/legacy_test/test_print_op.py | 39 ++++++++++++++++++-- test/xpu/op_test_xpu.py | 4 +- test/xpu/test_adamw_op_xpu.py | 2 +- 6 files changed, 54 insertions(+), 9 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 323207df20371..efb5b4c9d098f 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -595,7 +595,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"mean_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, - {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"merged_adam", XPUKernelSet({phi::DataType::FLOAT32})}, {"merged_momentum", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index aeb51998f4d7a..29e9c7e0f8901 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -571,7 +571,10 @@ XPUOpMap& get_kl3_ops() { XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"mean_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, - {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"merged_momentum", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"mish_grad", XPUKernelSet({phi::DataType::FLOAT32})}, diff --git a/paddle/phi/kernels/funcs/tensor_formatter.cc b/paddle/phi/kernels/funcs/tensor_formatter.cc index 16d3b38bced7c..7c4cd28fe20c7 100644 --- a/paddle/phi/kernels/funcs/tensor_formatter.cc +++ b/paddle/phi/kernels/funcs/tensor_formatter.cc @@ -107,6 +107,10 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor, FormatData(print_tensor, log_stream); } else if (dtype == phi::DataType::BOOL) { FormatData(print_tensor, log_stream); + } else if (dtype == phi::DataType::FLOAT16) { + FormatData(print_tensor, log_stream); + } else if (dtype == phi::DataType::BFLOAT16) { + FormatData(print_tensor, log_stream); } else { log_stream << " - data: unprintable type: " << dtype << std::endl; } @@ -153,6 +157,10 @@ template void TensorFormatter::FormatData( const phi::DenseTensor& print_tensor, std::stringstream& log_stream); template void TensorFormatter::FormatData( const phi::DenseTensor& print_tensor, std::stringstream& log_stream); +template void TensorFormatter::FormatData( + const phi::DenseTensor& print_tensor, std::stringstream& log_stream); +template void TensorFormatter::FormatData( + const phi::DenseTensor& print_tensor, std::stringstream& log_stream); } // namespace funcs } // namespace paddle diff --git a/test/legacy_test/test_print_op.py b/test/legacy_test/test_print_op.py index 95c1dd420626d..1ce1a08643210 100755 --- a/test/legacy_test/test_print_op.py +++ b/test/legacy_test/test_print_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import convert_float_to_uint16 from simple_nets import init_data, simple_fc_net import paddle @@ -30,14 +31,17 @@ class TestPrintOpCPU(unittest.TestCase): def setUp(self): + self.dtype = 'float32' self.place = paddle.CPUPlace() self.x_tensor = base.core.LoDTensor() - tensor_np = np.random.random(size=(2, 3)).astype('float32') + tensor_np = np.random.random(size=(2, 3)).astype(self.dtype) self.x_tensor.set(tensor_np, self.place) self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) def build_network(self, only_forward, **kargs): - x = paddle.static.data('x', shape=[-1, 3], dtype='float32', lod_level=1) + x = paddle.static.data( + 'x', shape=[-1, 3], dtype=self.dtype, lod_level=1 + ) x.stop_gradient = False paddle.static.Print(input=x, **kargs) loss = paddle.mean(x) @@ -77,7 +81,7 @@ def test_all_parameters(self): prog = paddle.static.Program() with paddle.static.program_guard(prog, paddle.static.Program()): x = paddle.static.data( - 'x', shape=[-1, 3], dtype='float32', lod_level=1 + 'x', shape=[-1, 3], dtype=self.dtype, lod_level=1 ) x.stop_gradient = False @@ -136,9 +140,36 @@ def test_errors(self): ) class TestPrintOpGPU(TestPrintOpCPU): def setUp(self): + self.dtype = 'float32' self.place = paddle.CUDAPlace(0) self.x_tensor = base.core.LoDTensor() - tensor_np = np.random.random(size=(2, 3)).astype('float32') + tensor_np = np.random.random(size=(2, 3)).astype(self.dtype) + self.x_tensor.set(tensor_np, self.place) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestPrintOpGPUFP16(TestPrintOpCPU): + def setUp(self): + self.dtype = 'float16' + self.place = paddle.CUDAPlace(0) + self.x_tensor = base.core.LoDTensor() + tensor_np = np.random.random(size=(2, 3)).astype(self.dtype) + self.x_tensor.set(tensor_np, self.place) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestPrintOpGPUBFP16(TestPrintOpCPU): + def setUp(self): + self.dtype = 'bfloat16' + self.place = paddle.CUDAPlace(0) + self.x_tensor = base.core.LoDTensor() + tensor_np = convert_float_to_uint16(np.random.random(size=(2, 3))) self.x_tensor.set(tensor_np, self.place) self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py index 7ea5359de5044..09ee428714bd6 100644 --- a/test/xpu/op_test_xpu.py +++ b/test/xpu/op_test_xpu.py @@ -183,8 +183,8 @@ def check_grad_with_place( if not core.is_float16_supported(place): return - if self.dtype == np.float16: - max_relative_error = 1.0 + if self.dtype == np.float16 or self.dtype == np.uint16: + max_relative_error = 0.1 return super().check_grad_with_place( place, inputs_to_check, diff --git a/test/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py index 1a777f2d23578..8584360837d79 100644 --- a/test/xpu/test_adamw_op_xpu.py +++ b/test/xpu/test_adamw_op_xpu.py @@ -84,7 +84,7 @@ def setUp(self): # Test AdamW Op with supplied attributes self.op_type = "adamw" self.init_shape() - self.dtype = self.in_type_str + self.dtype = self.in_type param = np.random.uniform(-1, 1, self.shape).astype(self.dtype) grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype) moment1 = np.random.uniform(-1, 1, self.shape).astype("float32") From 141ef977773231d6b424dd855c77f7677865680e Mon Sep 17 00:00:00 2001 From: yuguo <948529990@qq.com> Date: Wed, 29 Nov 2023 10:05:54 +0800 Subject: [PATCH 03/14] fix rnn op bug for DCU (#59402) --- paddle/phi/kernels/gpu/rnn_functor.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index bdbcd05e65772..ae43d64ea2d8c 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -99,14 +99,24 @@ class RNNDescriptors { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; bool is_initialized = dropout_state->initialized(); - if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP + if (!is_initialized) { PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); + dropout_state->Resize({static_cast(state_size)}); + dev_ctx.template Alloc(dropout_state); + } + dropout_desc_.descriptor(handle, // NOLINT + dev_ctx.GetPlace(), + is_initialized, + dropout_prob_, + dropout_state, + seed_, + state_size); #else + if (!is_test_ && !is_initialized) { PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); -#endif dropout_state->Resize({static_cast(state_size)}); dev_ctx.template Alloc(dropout_state); } @@ -117,6 +127,7 @@ class RNNDescriptors { is_test_ ? nullptr : dropout_state, seed_, state_size); +#endif // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP From 3c637e10cf4ac3aca5a15f90c87c509c3a0f9eb5 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:24:09 +0800 Subject: [PATCH 04/14] =?UTF-8?q?=E3=80=90pir=E3=80=91modfiy=20fused=5Flin?= =?UTF-8?q?ear=5Fparamgrad=5Fadd=5Fpass=20(#59296)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modfiy pass * modify pattern insert point wrong * modify pattern insert point wrong * fix bug DeleteSourcePatternOp in DRR * refine code * modify pattern insert point wrong * delete matmul_add replace * delete matmul_add replace --------- Co-authored-by: zyfncg --- paddle/fluid/pir/drr/drr_rewrite_pattern.cc | 2 +- .../fused_linear_param_grad_add_pass.cc | 51 ++++++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/pir/drr/drr_rewrite_pattern.cc b/paddle/fluid/pir/drr/drr_rewrite_pattern.cc index d558a2c10746b..91be95e788805 100644 --- a/paddle/fluid/pir/drr/drr_rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/drr_rewrite_pattern.cc @@ -401,7 +401,7 @@ MatchContextImpl DrrRewritePattern::CreateOperations( max_index_op = ir_input_op; } else if (max_input_op_index == op_2_temp_program_index[ir_input_op]) { const auto& ops_vec = temp_program[max_input_op_index]; - for (auto it = ops_vec.rbegin(); it != ops_vec.rend(); it++) { + for (auto it = ops_vec.begin(); it != ops_vec.end(); it++) { if (*it == max_index_op) { break; } else if (*it == ir_input_op) { diff --git a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc index bc0821d832c04..7a3afec65f33f 100644 --- a/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.cc @@ -26,15 +26,22 @@ class FusedMatmulAddGradAddPattern public: void operator()(pir::drr::DrrPatternContext *ctx) const override { pir::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &matmul0 = pat.Op(paddle::dialect::MatmulOp::name(), + {{"transpose_x", pat.Attr("trans_x")}, + {"transpose_y", pat.Attr("trans_y")}}); + const auto &add0 = pat.Op(paddle::dialect::AddOp::name()); const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name()); const auto &matmul_grad = pat.Op(paddle::dialect::MatmulGradOp::name(), {{"transpose_x", pat.Attr("trans_x")}, {"transpose_y", pat.Attr("trans_y")}}); const auto &add_ = pat.Op(paddle::dialect::Add_Op::name()); - add_grad( - {&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("addout_grad")}, - {&pat.Tensor("out_grad"), &pat.Tensor("dbias")}); + pat.Tensor("out") = matmul0(pat.Tensor("x"), pat.Tensor("weight")); + pat.Tensor("fwd_add_out") = add0(pat.Tensor("out"), pat.Tensor("bias")); + add_grad({&pat.Tensor("out"), + &pat.Tensor("bias"), + &pat.Tensor("fwd_add_out_grad")}, + {&pat.Tensor("out_grad"), &pat.Tensor("dbias")}); matmul_grad( {&pat.Tensor("x"), &pat.Tensor("weight"), &pat.Tensor("out_grad")}, {&pat.Tensor("x_grad"), &pat.Tensor("weight_grad")}); @@ -47,7 +54,7 @@ class FusedMatmulAddGradAddPattern return (match_ctx.Tensor("weight_grad").Shape() == match_ctx.Tensor("dweight").Shape() && match_ctx.Tensor("out").Shape() == - match_ctx.Tensor("addout_grad").Shape() && + match_ctx.Tensor("fwd_add_out_grad").Shape() && x_trans == false && y_trans == false); }); @@ -70,10 +77,10 @@ class FusedMatmulAddGradAddPattern paddle::dialect::FusedLinearParamGradAddOp::name(), {{{"multi_precision", muti_precision_attr}, {"has_bias", true_attr}}}); - matmul({&res.Tensor("addout_grad"), &res.Tensor("weight")}, + matmul({&res.Tensor("fwd_add_out_grad"), &res.Tensor("weight")}, {&res.Tensor("x_grad")}); fused_linear_param_grad_add({&res.Tensor("x"), - &res.Tensor("addout_grad"), + &res.Tensor("fwd_add_out_grad"), &res.Tensor("dweight"), &res.NoneTensor()}, {&res.Tensor("add_out"), &res.Tensor("dbias")}); @@ -232,15 +239,25 @@ class FusedMatmulAddGradAddaPattern public: void operator()(pir::drr::DrrPatternContext *ctx) const override { pir::drr::SourcePattern pat = ctx->SourcePattern(); - const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name()); const auto &matmul = pat.Op(paddle::dialect::MatmulOp::name(), {{"transpose_x", pat.Attr("trans_x")}, {"transpose_y", pat.Attr("trans_y")}}); + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name()); + const auto &matmul_g0 = pat.Op(paddle::dialect::MatmulOp::name(), + {{"transpose_x", pat.Attr("trans_xg0")}, + {"transpose_y", pat.Attr("trans_yg0")}}); + const auto &matmul_g1 = pat.Op(paddle::dialect::MatmulOp::name(), + {{"transpose_x", pat.Attr("trans_xg1")}, + {"transpose_y", pat.Attr("trans_yg1")}}); const auto &add_ = pat.Op(paddle::dialect::Add_Op::name()); + + pat.Tensor("out") = matmul(pat.Tensor("x"), pat.Tensor("weight")); + pat.Tensor("fwd_add_out") = add(pat.Tensor("out"), pat.Tensor("bias")); add_grad({&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("dadd_out")}, {&pat.Tensor("dout"), &pat.Tensor("dbias")}); - matmul({&pat.Tensor("x"), &pat.Tensor("dout")}, - {&pat.Tensor("weight_grad")}); + pat.Tensor("dx") = matmul_g0(pat.Tensor("dout"), pat.Tensor("weight")); + pat.Tensor("weight_grad") = matmul_g1(pat.Tensor("x"), pat.Tensor("dout")); pat.Tensor("dweight_out") = add_(pat.Tensor("dweight"), pat.Tensor("weight_grad")); @@ -277,15 +294,25 @@ class FusedMatmulAddGradAddbPattern public: void operator()(pir::drr::DrrPatternContext *ctx) const override { pir::drr::SourcePattern pat = ctx->SourcePattern(); - const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name()); const auto &matmul = pat.Op(paddle::dialect::MatmulOp::name(), {{"transpose_x", pat.Attr("trans_x")}, {"transpose_y", pat.Attr("trans_y")}}); + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + const auto &add_grad = pat.Op(paddle::dialect::AddGradOp::name()); + const auto &matmul_g0 = pat.Op(paddle::dialect::MatmulOp::name(), + {{"transpose_x", pat.Attr("trans_xg0")}, + {"transpose_y", pat.Attr("trans_yg0")}}); + const auto &matmul_g1 = pat.Op(paddle::dialect::MatmulOp::name(), + {{"transpose_x", pat.Attr("trans_xg1")}, + {"transpose_y", pat.Attr("trans_yg1")}}); const auto &add_ = pat.Op(paddle::dialect::Add_Op::name()); + + pat.Tensor("out") = matmul(pat.Tensor("x"), pat.Tensor("weight")); + pat.Tensor("fwd_add_out") = add(pat.Tensor("out"), pat.Tensor("bias")); add_grad({&pat.Tensor("out"), &pat.Tensor("bias"), &pat.Tensor("dadd_out")}, {&pat.Tensor("dout"), &pat.Tensor("dbias")}); - matmul({&pat.Tensor("x"), &pat.Tensor("dout")}, - {&pat.Tensor("weight_grad")}); + pat.Tensor("dx") = matmul_g0(pat.Tensor("dout"), pat.Tensor("weight")); + pat.Tensor("weight_grad") = matmul_g1(pat.Tensor("x"), pat.Tensor("dout")); pat.Tensor("dweight_out") = add_(pat.Tensor("weight_grad"), pat.Tensor("dweight")); From cae8de7856c8f5a632d49396f4949f5c2caba881 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 29 Nov 2023 10:25:12 +0800 Subject: [PATCH 05/14] [Fix] Fix DistributedBatchSampler for len(dataset) < num_replicas (#59390) * fix for case when len(dataset) < num_replicas * rewrite code --- python/paddle/io/dataloader/batch_sampler.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py index 20a9bb9a00da4..aa2f4547fd477 100644 --- a/python/paddle/io/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -271,7 +271,15 @@ def __init__( def __iter__(self): num_samples = len(self.dataset) indices = np.arange(num_samples).tolist() - indices += indices[: (self.total_size - len(indices))] + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] + assert len(indices) == self.total_size if self.shuffle: np.random.RandomState(self.epoch).shuffle(indices) From 1d859c503d3e3d4fd51cd1245d7fbe9c4cca7034 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:37:28 +0800 Subject: [PATCH 06/14] [Dy2St] pir dy2st unittest verification - Part 11 (#59314) --------- Co-authored-by: SigureMo --- test/dygraph_to_static/test_build_strategy.py | 2 + .../test_load_transformer.py | 7 +- test/dygraph_to_static/test_save_load.py | 75 ++++++----- test/dygraph_to_static/test_sentiment.py | 122 +++++++++--------- test/dygraph_to_static/test_simnet.py | 5 +- test/dygraph_to_static/test_simnet_v2.py | 3 +- 6 files changed, 112 insertions(+), 102 deletions(-) diff --git a/test/dygraph_to_static/test_build_strategy.py b/test/dygraph_to_static/test_build_strategy.py index 991b0d52c7698..84b31adc296db 100644 --- a/test/dygraph_to_static/test_build_strategy.py +++ b/test/dygraph_to_static/test_build_strategy.py @@ -18,6 +18,7 @@ from dygraph_to_static_utils import ( Dy2StTestBase, test_ast_only, + test_legacy_and_pt_and_pir, ) from test_resnet import ResNetHelper @@ -87,6 +88,7 @@ def test_in_static_mode_mkldnn(self): class TestError(Dy2StTestBase): + @test_legacy_and_pt_and_pir def test_type_error(self): def foo(x): out = x + 1 diff --git a/test/dygraph_to_static/test_load_transformer.py b/test/dygraph_to_static/test_load_transformer.py index ccb583428f95d..6698ba7ef6075 100644 --- a/test/dygraph_to_static/test_load_transformer.py +++ b/test/dygraph_to_static/test_load_transformer.py @@ -16,7 +16,10 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_legacy_and_pt_and_pir, +) import paddle @@ -45,6 +48,7 @@ class TestFallback(Dy2StTestBase): def setUp(self): self.x = paddle.to_tensor(1.0).astype('int') + @test_legacy_and_pt_and_pir def test_name_load(self): net_dy = Net() net_st = Net() @@ -54,6 +58,7 @@ def test_name_load(self): class TestLoad2(Dy2StTestBase): + @test_legacy_and_pt_and_pir def test_name_load_nograd(self): @paddle.no_grad() def func(x): diff --git a/test/dygraph_to_static/test_save_load.py b/test/dygraph_to_static/test_save_load.py index 755e9ff143e85..0246be752c841 100644 --- a/test/dygraph_to_static/test_save_load.py +++ b/test/dygraph_to_static/test_save_load.py @@ -72,40 +72,38 @@ def test_save_load_same_result(self): x_data = np.random.randn(30, 10, 32).astype('float32') batch_num = 3 - with base.dygraph.guard(place): - paddle.jit.enable_to_static(True) - x = base.dygraph.to_variable(x_data) - net = Linear(32, 64) - adam = Adam(learning_rate=0.1, parameters=net.parameters()) - - for i in range(batch_num): - static_out, static_loss = net(x) - # Update parameters - static_loss.backward() - adam.minimize(static_loss) - net.clear_gradients() - # Save parameters - - paddle.save(net.state_dict(), self.model_path + '.pdparams') - # minimize() will update parameter, call net() to get output and avg_loss. - # Switch into eval mode. - net.eval() + paddle.jit.enable_to_static(True) + x = base.dygraph.to_variable(x_data) + net = Linear(32, 64) + adam = Adam(learning_rate=0.1, parameters=net.parameters()) + + for i in range(batch_num): static_out, static_loss = net(x) + # Update parameters + static_loss.backward() + adam.minimize(static_loss) + net.clear_gradients() + # Save parameters + + paddle.save(net.state_dict(), self.model_path + '.pdparams') + # minimize() will update parameter, call net() to get output and avg_loss. + # Switch into eval mode. + net.eval() + static_out, static_loss = net(x) # load parameters into dygraph - with base.dygraph.guard(place): - dygraph_net = Linear(32, 64) + dygraph_net = Linear(32, 64) - # Load parameters - model_dict = paddle.load(self.model_path + '.pdparams') - dygraph_net.set_dict(model_dict) - # Switch into eval mode. - dygraph_net.eval() + # Load parameters + model_dict = paddle.load(self.model_path + '.pdparams') + dygraph_net.set_dict(model_dict) + # Switch into eval mode. + dygraph_net.eval() - x = base.dygraph.to_variable(x_data) - # predict output - paddle.jit.enable_to_static(False) - dygraph_out, dygraph_loss = dygraph_net(x) + x = base.dygraph.to_variable(x_data) + # predict output + paddle.jit.enable_to_static(False) + dygraph_out, dygraph_loss = dygraph_net(x) np.testing.assert_allclose( dygraph_out.numpy(), static_out.numpy(), rtol=1e-05 @@ -114,6 +112,17 @@ def test_save_load_same_result(self): dygraph_loss.numpy(), static_loss.numpy(), rtol=1e-05 ) + def _compute_op_num(self, composite_program): + if paddle.framework.use_pir_api(): + comp_op_type_list = [ + op.name() for op in composite_program.program.global_block().ops + ] + else: + comp_op_type_list = [ + op.type for op in composite_program.block(0).ops + ] + return comp_op_type_list + @test_ast_only def test_save_load_prim(self): with base.dygraph.guard(place): @@ -127,9 +136,7 @@ def test_save_load_prim(self): composite_program = static_net.forward.get_concrete_program(self.x)[ 1 ].train_program - comp_op_type_list = [ - op.type for op in composite_program.block(0).ops - ] + comp_op_type_list = self._compute_op_num(composite_program) self.assertNotIn("batch_norm", comp_op_type_list) self.assertNotIn("relu", comp_op_type_list) self.assertNotIn("pow", comp_op_type_list) @@ -169,9 +176,7 @@ def test_save_load_prim_with_hook(self): composite_program = static_net.forward.get_concrete_program(self.x)[ 1 ].train_program - comp_op_type_list = [ - op.type for op in composite_program.block(0).ops - ] + comp_op_type_list = self._compute_op_num(composite_program) self.assertNotIn("batch_norm", comp_op_type_list) self.assertNotIn("relu", comp_op_type_list) self.assertNotIn("pow", comp_op_type_list) diff --git a/test/dygraph_to_static/test_sentiment.py b/test/dygraph_to_static/test_sentiment.py index bc606751f4624..5e93521aade88 100644 --- a/test/dygraph_to_static/test_sentiment.py +++ b/test/dygraph_to_static/test_sentiment.py @@ -21,7 +21,6 @@ import paddle from paddle import base from paddle.base.dygraph import to_variable -from paddle.jit.api import to_static from paddle.nn import Embedding, Linear SEED = 2020 @@ -88,7 +87,6 @@ def __init__(self, dict_dim, batch_size, seq_len): self._fc1_act = paddle.nn.Softmax() self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim) - @to_static def forward(self, inputs, label=None): emb = self.embedding(inputs) o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( @@ -132,7 +130,6 @@ def __init__(self, dict_dim, batch_size, seq_len): self._fc2 = Linear(self.hid_dim, self.fc_hid_dim) self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim) - @to_static def forward(self, inputs, label=None): emb = self.embedding(inputs) o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( @@ -171,7 +168,7 @@ def __init__(self, dict_dim, batch_size, seq_len): self.embedding = Embedding( self.dict_dim + 1, self.emb_dim, - weight_attr=base.ParamAttr(learning_rate=30), + weight_attr=paddle.ParamAttr(learning_rate=30), sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") @@ -181,7 +178,6 @@ def __init__(self, dict_dim, batch_size, seq_len): self._fc_prediction = Linear(self.fc_hid_dim, self.class_dim) self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) - @to_static def forward(self, inputs, label=None): emb = self.embedding(inputs) o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( @@ -219,7 +215,7 @@ def __init__(self, dict_dim, batch_size, seq_len): self.embedding = Embedding( self.dict_dim + 1, self.emb_dim, - weight_attr=base.ParamAttr(learning_rate=30), + weight_attr=paddle.ParamAttr(learning_rate=30), sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") @@ -234,7 +230,6 @@ def __init__(self, dict_dim, batch_size, seq_len): size=self.hid_dim, h_0=h_0, is_reverse=True ) - @to_static def forward(self, inputs, label=None): emb = self.embedding(inputs) o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( @@ -304,68 +299,71 @@ class Args: def train(args, to_static): paddle.jit.enable_to_static(to_static) - place = ( - base.CUDAPlace(0) if base.is_compiled_with_cuda() else base.CPUPlace() - ) + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) - with base.dygraph.guard(place): - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) + train_reader = fake_data_reader( + args.class_num, args.vocab_size, args.batch_size, args.padding_size + ) + train_loader = base.io.DataLoader.from_generator(capacity=24) + train_loader.set_sample_list_generator(train_reader) - train_reader = fake_data_reader( - args.class_num, args.vocab_size, args.batch_size, args.padding_size + if args.model_type == 'cnn_net': + model = paddle.jit.to_static( + CNN(args.vocab_size, args.batch_size, args.padding_size) + ) + elif args.model_type == 'bow_net': + model = paddle.jit.to_static( + BOW(args.vocab_size, args.batch_size, args.padding_size) ) - train_loader = base.io.DataLoader.from_generator(capacity=24) - train_loader.set_sample_list_generator(train_reader) - - if args.model_type == 'cnn_net': - model = CNN(args.vocab_size, args.batch_size, args.padding_size) - elif args.model_type == 'bow_net': - model = BOW(args.vocab_size, args.batch_size, args.padding_size) - elif args.model_type == 'gru_net': - model = GRU(args.vocab_size, args.batch_size, args.padding_size) - elif args.model_type == 'bigru_net': - model = BiGRU(args.vocab_size, args.batch_size, args.padding_size) - sgd_optimizer = paddle.optimizer.Adagrad( - learning_rate=args.lr, parameters=model.parameters() + elif args.model_type == 'gru_net': + model = paddle.jit.to_static( + GRU(args.vocab_size, args.batch_size, args.padding_size) ) + elif args.model_type == 'bigru_net': + model = paddle.jit.to_static( + BiGRU(args.vocab_size, args.batch_size, args.padding_size) + ) + sgd_optimizer = paddle.optimizer.Adagrad( + learning_rate=args.lr, parameters=model.parameters() + ) - loss_data = [] - for eop in range(args.epoch): - time_begin = time.time() - for batch_id, data in enumerate(train_loader()): - word_ids, labels, seq_lens = data - doc = to_variable(word_ids.numpy().reshape(-1)).astype('int64') - label = labels.astype('int64') - - model.train() - avg_cost, prediction, acc = model(doc, label) - loss_data.append(float(avg_cost)) - - avg_cost.backward() - sgd_optimizer.minimize(avg_cost) - model.clear_gradients() - - if batch_id % args.log_step == 0: - time_end = time.time() - used_time = time_end - time_begin - # used_time may be 0.0, cause zero division error - if used_time < 1e-5: - used_time = 1e-5 - print( - "step: %d, ave loss: %f, speed: %f steps/s" - % ( - batch_id, - float(avg_cost), - args.log_step / used_time, - ) + loss_data = [] + for eop in range(args.epoch): + time_begin = time.time() + for batch_id, data in enumerate(train_loader()): + word_ids, labels, seq_lens = data + doc = paddle.to_tensor(word_ids.numpy().reshape(-1), dtype="int64") + label = labels.astype('int64') + + model.train() + avg_cost, prediction, acc = model(doc, label) + loss_data.append(float(avg_cost)) + + avg_cost.backward() + sgd_optimizer.minimize(avg_cost) + model.clear_gradients() + + if batch_id % args.log_step == 0: + time_end = time.time() + used_time = time_end - time_begin + # used_time may be 0.0, cause zero division error + if used_time < 1e-5: + used_time = 1e-5 + print( + "step: %d, ave loss: %f, speed: %f steps/s" + % ( + batch_id, + float(avg_cost), + args.log_step / used_time, ) - time_begin = time.time() + ) + time_begin = time.time() - if batch_id == args.train_step: - break - batch_id += 1 + if batch_id == args.train_step: + break + batch_id += 1 return loss_data diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py index 1b6a5148023f8..b1dc687abad3b 100644 --- a/test/dygraph_to_static/test_simnet.py +++ b/test/dygraph_to_static/test_simnet.py @@ -17,9 +17,7 @@ import unittest import numpy as np -from dygraph_to_static_utils import ( - Dy2StTestBase, -) +from dygraph_to_static_utils import Dy2StTestBase, test_legacy_and_pt_and_pir from simnet_dygraph_model import BOW, HingeLoss import paddle @@ -180,6 +178,7 @@ def train(conf_dict, to_static): class TestSimnet(Dy2StTestBase): + @test_legacy_and_pt_and_pir def test_dygraph_static_same_loss(self): if base.is_compiled_with_cuda(): base.set_flags({"FLAGS_cudnn_deterministic": True}) diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py index d87235ca8ce31..1c4476a99457d 100644 --- a/test/dygraph_to_static/test_simnet_v2.py +++ b/test/dygraph_to_static/test_simnet_v2.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase +from dygraph_to_static_utils import Dy2StTestBase, test_legacy_and_pt_and_pir from simnet_dygraph_model_v2 import BOW, HingeLoss import paddle @@ -177,6 +177,7 @@ def train(conf_dict, to_static): class TestSimnet(Dy2StTestBase): + @test_legacy_and_pt_and_pir def test_dygraph_static_same_loss(self): if paddle.is_compiled_with_cuda(): paddle.base.set_flags({"FLAGS_cudnn_deterministic": True}) From 4f23e7e86fe854ae7da39a3036575a302df92f58 Mon Sep 17 00:00:00 2001 From: lzy <569782149@qq.com> Date: Wed, 29 Nov 2023 10:37:59 +0800 Subject: [PATCH 07/14] optimize mem_eff_attn's compilation (#59446) --- paddle/phi/kernels/CMakeLists.txt | 65 ++++++++++++++++++- .../generate_kernels.py | 13 ++-- .../generate_variable_forward_kernels.py | 13 ++-- 3 files changed, 79 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 6fbbd0cf83306..5355b4e08c21b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -66,14 +66,14 @@ if(WITH_CUTLASS) COMMAND ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py - --cuda_arch "${NVCC_ARCH_BIN}" + --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_tmp" RESULT_VARIABLE memory_efficient_attention_gen_res) execute_process( COMMAND ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py - --cuda_arch "${NVCC_ARCH_BIN}" + --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_variable_tmp" RESULT_VARIABLE memory_efficient_attention_gen_res) if(NOT memory_efficient_attention_gen_res EQUAL 0) @@ -83,6 +83,67 @@ if(WITH_CUTLASS) ) endif() + set(autogen_tmp_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp + ) + set(autogen_variable_tmp_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp + ) + set(autogen_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen + ) + set(autogen_variable_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable + ) + + file(GLOB generated_files ${autogen_tmp_dir}/*.h ${autogen_tmp_dir}/impl/*.cu) + + file(GLOB variable_generated_files ${autogen_variable_tmp_dir}/*.h + ${autogen_variable_tmp_dir}/impl/*.cu) + + if(EXISTS ${autogen_dir}) + foreach(gen_file ${generated_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${autogen_dir}") + else() + foreach(gen_file ${generated_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${autogen_dir}") + endif() + + if(EXISTS ${autogen_variable_dir}) + foreach(gen_file ${variable_generated_files}) + string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file + ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${autogen_variable_dir}") + else() + foreach(gen_file ${variable_generated_files}) + string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file + ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${autogen_variable_dir}") + endif() + + file( + REMOVE_RECURSE + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp + ) + file( + REMOVE_RECURSE + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp + ) + execute_process( COMMAND ${CMAKE_COMMAND} -E make_directory diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py index 7caf30236bb79..a6633d02ecebe 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py @@ -26,7 +26,6 @@ import collections import itertools import os -import shutil from dataclasses import dataclass, field from pathlib import Path from typing import Dict, List, Optional, Tuple, TypeVar @@ -94,6 +93,12 @@ def parse_args(): default=convert_to_arch_list("All"), help="The CUDA architecture to be generated.", ) + parser.add_argument( + "--gen_dir", + type=str, + default="autogen_variable", + help="The directory to save the generated files.", + ) args = parser.parse_args() args.max_arch = find_max_arch(args.cuda_arch) return args @@ -425,7 +430,7 @@ def write_decl_impl( declarations += "} // namespace phi\n" declarations += f"#endif // {enable_def}\n" - autogen_dir = Path(args.dst_path) / "autogen" + autogen_dir = Path(args.dst_path) / args.gen_dir os.makedirs(autogen_dir, exist_ok=True) declaration_path = autogen_dir / f"{family_name}.h" declaration_path.write_text(declarations) @@ -530,14 +535,12 @@ def write_main_header(forward_impl, backward_impl): #endif ''' - path = Path(args.dst_path) / "autogen" + path = Path(args.dst_path) / args.gen_dir os.makedirs(path, exist_ok=True) path = Path(path) / "memory_efficient_attention.h" path.write_text(main_header_content) -if os.path.exists(Path(args.dst_path) / "autogen"): - shutil.rmtree(Path(args.dst_path) / "autogen") forward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h" backward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h" diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py index 86c08cdf8d107..07e710e52d206 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py @@ -26,7 +26,6 @@ import collections import itertools import os -import shutil from dataclasses import dataclass, field from pathlib import Path from typing import Dict, List, Optional, Tuple, TypeVar @@ -94,6 +93,12 @@ def parse_args(): default=convert_to_arch_list("All"), help="The CUDA architecture to be generated.", ) + parser.add_argument( + "--gen_dir", + type=str, + default="autogen_variable", + help="The directory to save the generated files.", + ) args = parser.parse_args() args.max_arch = find_max_arch(args.cuda_arch) return args @@ -399,7 +404,7 @@ def write_decl_impl( declarations += "} // namespace phi\n" declarations += f"#endif // {enable_def}\n" - autogen_dir = Path(args.dst_path) / "autogen_variable" + autogen_dir = Path(args.dst_path) / args.gen_dir os.makedirs(autogen_dir, exist_ok=True) declaration_path = autogen_dir / f"{family_name}.h" declaration_path.write_text(declarations) @@ -546,14 +551,12 @@ def write_main_header(): #endif ''' - path = Path(args.dst_path) / "autogen_variable" + path = Path(args.dst_path) / args.gen_dir os.makedirs(path, exist_ok=True) path = Path(path) / "memory_efficient_variable_attention.h" path.write_text(main_header_content) -if os.path.exists(Path(args.dst_path) / "autogen_variable"): - shutil.rmtree(Path(args.dst_path) / "autogen_variable") forward_impl = "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable/memory_efficient_variable_attention.h" write_main_header() From 3b40279ef60ce6577af5c1671a69168e7ab97281 Mon Sep 17 00:00:00 2001 From: Ryan <44900829+DrRyanHuang@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:39:09 +0800 Subject: [PATCH 08/14] [Dy2St] pir dy2st unittest verification - Part -2 (#59370) --------- Co-authored-by: SigureMo --- test/dygraph_to_static/test_assert.py | 31 ++++++++++-------- test/dygraph_to_static/test_dict.py | 29 ++++++++++------- test/dygraph_to_static/test_isinstance.py | 38 ++++++++++++++--------- 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/test/dygraph_to_static/test_assert.py b/test/dygraph_to_static/test_assert.py index bde776bf023d9..01250626f07e0 100644 --- a/test/dygraph_to_static/test_assert.py +++ b/test/dygraph_to_static/test_assert.py @@ -17,21 +17,17 @@ import numpy from dygraph_to_static_utils import ( Dy2StTestBase, - test_ast_only, ) import paddle from paddle import base -from paddle.jit.api import to_static -@paddle.jit.to_static def dyfunc_assert_variable(x): x_v = base.dygraph.to_variable(x) assert x_v -@to_static def dyfunc_assert_non_variable(x=True): assert x @@ -51,31 +47,40 @@ def _run_dy_static(self, func, x, with_exception): self._run(func, x, with_exception, True) self._run(func, x, with_exception, False) - @test_ast_only def test_non_variable(self): self._run_dy_static( - dyfunc_assert_non_variable, x=False, with_exception=True + paddle.jit.to_static(dyfunc_assert_non_variable), + x=False, + with_exception=True, ) self._run_dy_static( - dyfunc_assert_non_variable, x=True, with_exception=False + paddle.jit.to_static(dyfunc_assert_non_variable), + x=True, + with_exception=False, ) - @test_ast_only def test_bool_variable(self): self._run_dy_static( - dyfunc_assert_variable, x=numpy.array([False]), with_exception=True + paddle.jit.to_static(dyfunc_assert_variable), + x=numpy.array([False]), + with_exception=True, ) self._run_dy_static( - dyfunc_assert_variable, x=numpy.array([True]), with_exception=False + paddle.jit.to_static(dyfunc_assert_variable), + x=numpy.array([True]), + with_exception=False, ) - @test_ast_only def test_int_variable(self): self._run_dy_static( - dyfunc_assert_variable, x=numpy.array([0]), with_exception=True + paddle.jit.to_static(dyfunc_assert_variable), + x=numpy.array([0]), + with_exception=True, ) self._run_dy_static( - dyfunc_assert_variable, x=numpy.array([1]), with_exception=False + paddle.jit.to_static(dyfunc_assert_variable), + x=numpy.array([1]), + with_exception=False, ) diff --git a/test/dygraph_to_static/test_dict.py b/test/dygraph_to_static/test_dict.py index b8b2480573527..f69b112ba9afd 100644 --- a/test/dygraph_to_static/test_dict.py +++ b/test/dygraph_to_static/test_dict.py @@ -15,11 +15,13 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_legacy_and_pt_and_pir, +) import paddle from paddle import base -from paddle.jit import to_static PLACE = base.CUDAPlace(0) if base.is_compiled_with_cuda() else base.CPUPlace() @@ -79,7 +81,6 @@ def __init__(self, batch_size=64, hidden_size=16, output_size=16): self.output_size = output_size self.sub_net = SubNetWithDict(hidden_size, output_size) - @to_static def forward(self, input, max_len=4): input = base.dygraph.to_variable(input) cache = { @@ -135,17 +136,19 @@ def _run_dygraph(self): def train(self, to_static=False): paddle.jit.enable_to_static(to_static) with base.dygraph.guard(PLACE): - net = MainNetWithDict(batch_size=self.batch_size) + net = paddle.jit.to_static( + MainNetWithDict(batch_size=self.batch_size) + ) ret = net(self.x) return ret.numpy() + @test_legacy_and_pt_and_pir def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) # Tests for dict pop -@paddle.jit.to_static -def test_dic_pop(x): +def test_dict_pop(x): x = paddle.to_tensor(x) dict_a = {"red": 0, "green": 1, "blue": 2} @@ -156,8 +159,7 @@ def test_dic_pop(x): return out -@paddle.jit.to_static -def test_dic_pop_2(x): +def test_dict_pop_2(x): x = paddle.to_tensor(x) dict_a = {"red": x, "green": x + 1, "blue": x + 3} @@ -179,7 +181,7 @@ def setUp(self): self._set_test_func() def _set_test_func(self): - self.dygraph_func = test_dic_pop + self.dygraph_func = paddle.jit.to_static(test_dict_pop) def _run_static(self): return self._run(to_static=True) @@ -194,6 +196,7 @@ def _run(self, to_static): return result.numpy() + @test_legacy_and_pt_and_pir def test_transformed_result(self): dygraph_res = self._run_dygraph() static_res = self._run_static() @@ -207,14 +210,13 @@ def test_transformed_result(self): class TestDictPop2(TestDictPop): def _set_test_func(self): - self.dygraph_func = test_dic_pop_2 + self.dygraph_func = paddle.jit.to_static(test_dict_pop_2) class NetWithDictPop(paddle.nn.Layer): def __init__(self): super().__init__() - @to_static def forward(self, x, **kwargs): x = paddle.to_tensor(x) y = kwargs.pop('y', None) @@ -233,10 +235,11 @@ def setUp(self): def train(self, to_static=False): paddle.jit.enable_to_static(to_static) with base.dygraph.guard(PLACE): - net = NetWithDictPop() + net = paddle.jit.to_static(NetWithDictPop()) ret = net(z=0, x=self.x, y=True) return ret.numpy() + @test_legacy_and_pt_and_pir def test_ast_to_func(self): dygraph_result = self._run_dygraph() static_result = self._run_static() @@ -248,6 +251,7 @@ def test_ast_to_func(self): class TestDictCmpInFor(Dy2StTestBase): + @test_legacy_and_pt_and_pir def test_with_for(self): def func(): pos = [1, 3] @@ -264,6 +268,7 @@ def func(): self.assertEqual(paddle.jit.to_static(func)()['minus'], 8) + @test_legacy_and_pt_and_pir def test_with_for_enumerate(self): def func(): pos = [1, 3] diff --git a/test/dygraph_to_static/test_isinstance.py b/test/dygraph_to_static/test_isinstance.py index 498c84ad0e885..9cac83cdbbe2f 100644 --- a/test/dygraph_to_static/test_isinstance.py +++ b/test/dygraph_to_static/test_isinstance.py @@ -88,24 +88,34 @@ def train(model, to_static): class TestIsinstance(Dy2StTestBase): @test_legacy_and_pt_and_pir def test_isinstance_simple_return_layer(self): - model = paddle.jit.to_static(IsInstanceLayer(SimpleReturnLayer())) - self._test_model(model) + model_creator = lambda: paddle.jit.to_static( + IsInstanceLayer(SimpleReturnLayer()) + ) + self._test_model(model_creator) + @test_legacy_and_pt_and_pir def test_isinstance_add_attr_layer(self): - model = paddle.jit.to_static(IsInstanceLayer(AddAttrLayer())) - self._test_model(model) + model_creator = lambda: paddle.jit.to_static( + IsInstanceLayer(AddAttrLayer()) + ) + self._test_model(model_creator) + @test_legacy_and_pt_and_pir def test_sequential_layer(self): - layers = [] - for i in range(5): - layers.append(SimpleReturnLayer()) - layers.append(AddAttrLayer()) - model = paddle.jit.to_static(SequentialLayer(layers)) - self._test_model(model) - - def _test_model(self, model): - st_out = train(model, to_static=True) - dy_out = train(model, to_static=False) + def model_creator(): + layers = [] + for i in range(5): + layers.append(SimpleReturnLayer()) + layers.append(AddAttrLayer()) + return paddle.jit.to_static(SequentialLayer(layers)) + + self._test_model(model_creator) + + def _test_model(self, model_creator): + st_model = model_creator() + st_out = train(st_model, to_static=True) + dy_model = model_creator() + dy_out = train(dy_model, to_static=False) np.testing.assert_allclose( dy_out, st_out, From 87bf502307ac3041f3e1171415b052fe8815e0be Mon Sep 17 00:00:00 2001 From: zhengzhonghui <38675903+deepllz@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:43:00 +0800 Subject: [PATCH 09/14] [AutoParallel] Add reduce_all spmd rule (#59411) * [AutoParallel] Add reduce_all spmd rule * fix compile error on window * add c++ unittest for reduce_all spmd --- paddle/phi/api/yaml/legacy_ops.yaml | 1 + .../auto_parallel/reshard/reshard_utils.h | 6 ++-- paddle/phi/core/distributed/gloo_utils.h | 7 ++++ paddle/phi/core/visit_type.h | 29 +++++++++++++++-- paddle/phi/infermeta/spmd_rules/reduction.cc | 7 ++++ paddle/phi/infermeta/spmd_rules/reduction.h | 4 +++ paddle/phi/kernels/gpu/all_reduce_kernel.cu | 5 +++ .../semi_auto_parallel_for_reduction.py | 32 +++++++++++++++++-- test/cpp/auto_parallel/spmd_rule_test.cc | 21 ++++++++++++ 9 files changed, 104 insertions(+), 8 deletions(-) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index c14588389a706..3cfcf155f47c4 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -34,6 +34,7 @@ output : Tensor(out) infer_meta : func : ReduceInferMeta + spmd_rule : ReductionAllInferSpmdDynamic kernel : func : all diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h index f064d038c5186..022dc06598064 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h @@ -76,14 +76,14 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, do { \ if (phi::CPUContext::classof(dev_ctx)) { \ VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU."; \ - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( \ + PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU( \ dtype, #fn_name, ([&] { \ fn_name(static_cast(*dev_ctx), \ __VA_ARGS__); \ })); \ } else if (phi::GPUContext::classof(dev_ctx)) { \ VLOG(4) << "Call `" << #fn_name << "` in Resharding on GPU."; \ - PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES( \ + PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU( \ dtype, #fn_name, ([&] { \ fn_name(static_cast(*dev_ctx), \ __VA_ARGS__); \ @@ -99,7 +99,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, do { \ if (phi::CPUContext::classof(dev_ctx)) { \ VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU."; \ - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( \ + PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU( \ dtype, #fn_name, ([&] { \ fn_name(static_cast(*dev_ctx), \ __VA_ARGS__); \ diff --git a/paddle/phi/core/distributed/gloo_utils.h b/paddle/phi/core/distributed/gloo_utils.h index 36680a873111a..5c0be5d5ca0cf 100644 --- a/paddle/phi/core/distributed/gloo_utils.h +++ b/paddle/phi/core/distributed/gloo_utils.h @@ -142,6 +142,13 @@ void SetReduceFunc(P* opts, int reduce_type) { static_cast( &gloo::product)); break; + case ReduceType::kRedAll: + // NOTE(zhonghui): There is no reduce_all math function for gloo, just use + // min to replace + opts->setReduceFunction( + static_cast( + &gloo::min)); + break; default: PADDLE_THROW( errors::InvalidArgument("Unsupport reduce type: %d.", reduce_type)); diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 9f1b258bb5b03..047ba79bc1599 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -151,7 +151,7 @@ namespace phi { ///////// BOOL and Floating and Integral Dispatch Marco /////////// #if NCCL_VERSION_CODE >= 21000 -#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ +#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ switch (__dtype__) { \ @@ -180,7 +180,7 @@ namespace phi { } \ }() #else -#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ +#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ switch (__dtype__) { \ @@ -208,6 +208,31 @@ namespace phi { }() #endif +#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_CPU(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + ///////// Floating and Complex Dispatch Marco /////////// #define PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc index a1fc0873a244a..1129c2ab81e35 100644 --- a/paddle/phi/infermeta/spmd_rules/reduction.cc +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -159,6 +159,13 @@ SpmdInfo ReductionMaxInferSpmdDynamic(const DistMetaTensor& x, x, axis.GetData(), keep_dim, static_cast(ReduceType::kRedMax)); } +SpmdInfo ReductionAllInferSpmdDynamic(const DistMetaTensor& x, + const IntArray& axis, + bool keep_dim) { + return ReductionInferSpmdBase( + x, axis.GetData(), keep_dim, static_cast(ReduceType::kRedAll)); +} + SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& axis, diff --git a/paddle/phi/infermeta/spmd_rules/reduction.h b/paddle/phi/infermeta/spmd_rules/reduction.h index 30144e6d7ca46..a168f4cb500e5 100644 --- a/paddle/phi/infermeta/spmd_rules/reduction.h +++ b/paddle/phi/infermeta/spmd_rules/reduction.h @@ -44,6 +44,10 @@ SpmdInfo ReductionMaxInferSpmdDynamic(const DistMetaTensor& x, const IntArray& axis, bool keep_dim); +SpmdInfo ReductionAllInferSpmdDynamic(const DistMetaTensor& x, + const IntArray& axis, + bool keep_dim); + SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& axis, diff --git a/paddle/phi/kernels/gpu/all_reduce_kernel.cu b/paddle/phi/kernels/gpu/all_reduce_kernel.cu index 9cd283695c89a..0c920ef1bc61e 100644 --- a/paddle/phi/kernels/gpu/all_reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/all_reduce_kernel.cu @@ -57,6 +57,11 @@ void AllReduceKernel(const Context& dev_ctx, case ReduceType::kRedProd: red_type = ncclProd; break; + case ReduceType::kRedAll: + // NOTE(zhonghui): There is no reduce_all type of ncclRedOp_t, just use + // min to replace + red_type = ncclMin; + break; } comm_ctx->AllReduce(out, x, red_type, stream); #else diff --git a/test/auto_parallel/semi_auto_parallel_for_reduction.py b/test/auto_parallel/semi_auto_parallel_for_reduction.py index 5cd7ef4596268..15c43b9f56304 100644 --- a/test/auto_parallel/semi_auto_parallel_for_reduction.py +++ b/test/auto_parallel/semi_auto_parallel_for_reduction.py @@ -37,8 +37,11 @@ def test_body( ): paddle.seed(self._seed) np.random.seed(self._seed) + is_op_func_all = op_func == paddle.all x = paddle.randn(x_shape, self._dtype) + if is_op_func_all: + x = x > 0 x.stop_gradient = False dist_x = dist.shard_tensor(x, self._mesh, x_placements) @@ -49,9 +52,10 @@ def test_body( self.check_tensor_eq(out, dist_out) np.testing.assert_equal(dist_out.shape, out_shape, verbose=True) - dist_out.backward() - out.backward() - self.check_tensor_eq(x.grad, dist_x.grad) + if not is_op_func_all: + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) def test_sum_x_shard(self): self.test_body( @@ -113,6 +117,26 @@ def test_max_x_shard_on_axis(self): op_func=paddle.max, ) + def test_all_x_shard(self): + self.test_body( + x_shape=[4, 8, 6], + out_shape=[4, 6], + x_placements=[dist.Shard(0)], + axis=1, + keepdim=False, + op_func=paddle.all, + ) + + def test_all_x_shard_on_axis(self): + self.test_body( + x_shape=[4, 8, 6], + out_shape=[4, 6], + x_placements=[dist.Shard(1)], + axis=1, + keepdim=False, + op_func=paddle.all, + ) + def run_test_case(self): if self._backend == "cpu": paddle.set_device("cpu") @@ -127,6 +151,8 @@ def run_test_case(self): self.test_mean_x_shard() self.test_max_x_shard() self.test_max_x_shard_on_axis() + self.test_all_x_shard() + self.test_all_x_shard_on_axis() if __name__ == '__main__': diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index 844758e923e3b..2a13d46db2798 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -1006,6 +1006,27 @@ TEST(ReduceMaxRule, Ctor) { check_partial_dims(backward_info.second[0], {}); } +TEST(ReduceAllRule, Ctor) { + std::vector mesh_shape = {2}; + std::vector process_ids = {0, 1}; + std::vector dim_names = {"x"}; + ProcessMesh process_mesh(mesh_shape, process_ids, dim_names); + + // test forward + auto t_dist_attr = TensorDistAttr(); + t_dist_attr.set_process_mesh(process_mesh); + t_dist_attr.set_dims_mapping({-1, 0, -1}); + t_dist_attr.set_dynamic_dims({false, false, false}); + phi::distributed::DistMetaTensor x = + phi::distributed::DistMetaTensor(phi::make_ddim({4, 6, 8}), t_dist_attr); + IntArray axis = {1}; + bool keep_dim = false; + phi::distributed::SpmdInfo forward_info = + phi::distributed::ReductionAllInferSpmdDynamic(x, axis, keep_dim); + check_dim_mapping(forward_info.second[0], {-1, -1}); + check_partial_dims(forward_info.second[0], {0}); +} + TEST(Numel, Ctor) { std::vector mesh_shape = {2, 2}; std::vector process_ids = {0, 1, 2, 3}; From 5e897083b58b0ff991854c963d791fcab0946928 Mon Sep 17 00:00:00 2001 From: zyt1024 <42999008+zyt1024@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:43:27 +0800 Subject: [PATCH 10/14] =?UTF-8?q?=E3=80=90complex=20op=E3=80=91=20add=20co?= =?UTF-8?q?mplex=20support=20for=20=20unbind,broadcast,broadcast=5Ftensors?= =?UTF-8?q?=20and=20broadcast=5Ftensor=5Fgrad=20(#59122)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add complex support for unbind,broadcast,broadcast_tensors and broadcast_tensors_grad * add test_dtype * add complex support for unbind,broadcast,broadcast_tensors and broadcast_tensor_grad * fix code_style * Resolve conflicts and generate complex data. --- paddle/phi/kernels/cpu/broadcast_kernel.cc | 4 +- .../cpu/broadcast_tensors_grad_kernel.cc | 4 +- .../kernels/cpu/broadcast_tensors_kernel.cc | 4 +- paddle/phi/kernels/cpu/unbind_kernel.cc | 4 +- paddle/phi/kernels/gpu/broadcast_kernel.cu | 8 +- .../gpu/broadcast_tensors_grad_kernel.cu | 4 +- .../kernels/gpu/broadcast_tensors_kernel.cu | 4 +- paddle/phi/kernels/gpu/unbind_kernel.cu | 4 +- .../distributed/communication/broadcast.py | 2 +- python/paddle/tensor/manipulation.py | 8 +- .../cpp/phi/kernels/test_ternary_broadcast.cu | 48 ++++++ test/legacy_test/op_test.py | 2 + test/legacy_test/test_broadcast_error.py | 20 ++- test/legacy_test/test_broadcast_tensors_op.py | 107 ++++++++---- test/legacy_test/test_unbind_op.py | 152 +++++++++++++++--- 15 files changed, 310 insertions(+), 65 deletions(-) diff --git a/paddle/phi/kernels/cpu/broadcast_kernel.cc b/paddle/phi/kernels/cpu/broadcast_kernel.cc index baa12d1815edc..0deb8d8bbc562 100644 --- a/paddle/phi/kernels/cpu/broadcast_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_kernel.cc @@ -62,4 +62,6 @@ PD_REGISTER_KERNEL(broadcast, int8_t, uint8_t, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 0656f681367ff..8f73c5c5f5f6e 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -199,4 +199,6 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc index 3ad26164d7d8d..7d0e08655fc27 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(broadcast_tensors, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc index e8d0c01352c97..255f73af1aca7 100644 --- a/paddle/phi/kernels/cpu/unbind_kernel.cc +++ b/paddle/phi/kernels/cpu/unbind_kernel.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(unbind, phi::dtype::float16, phi::dtype::bfloat16, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/broadcast_kernel.cu b/paddle/phi/kernels/gpu/broadcast_kernel.cu index 4b46e218c328e..e4986f752b1ae 100644 --- a/paddle/phi/kernels/gpu/broadcast_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_kernel.cu @@ -66,7 +66,9 @@ PD_REGISTER_KERNEL(broadcast, int8_t, uint8_t, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} #else PD_REGISTER_KERNEL(broadcast, GPU, @@ -79,5 +81,7 @@ PD_REGISTER_KERNEL(broadcast, int8_t, uint8_t, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} #endif diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 40ea1f195069e..1c56b93c7c1dc 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -111,4 +111,6 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu index 3d16797cb66c0..aae7d53aeb43a 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -28,4 +28,6 @@ PD_REGISTER_KERNEL(broadcast_tensors, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/unbind_kernel.cu b/paddle/phi/kernels/gpu/unbind_kernel.cu index 37272cebdf118..178191f048e30 100644 --- a/paddle/phi/kernels/gpu/unbind_kernel.cu +++ b/paddle/phi/kernels/gpu/unbind_kernel.cu @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(unbind, phi::dtype::float16, phi::dtype::bfloat16, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/python/paddle/distributed/communication/broadcast.py b/python/paddle/distributed/communication/broadcast.py index 208158cd20918..9c87e0345db5f 100644 --- a/python/paddle/distributed/communication/broadcast.py +++ b/python/paddle/distributed/communication/broadcast.py @@ -37,7 +37,7 @@ def broadcast(tensor, src, group=None, sync_op=True): Args: tensor (Tensor): The tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. src (int): The source rank in global view. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 5bec599390fdb..a0ed76a3b970d 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1349,7 +1349,7 @@ def broadcast_tensors(input, name=None): Args: input (list|tuple): ``input`` is a Tensor list or Tensor tuple which is with data type bool, - float16, float32, float64, int32, int64. All the Tensors in ``input`` must have same data type. + float16, float32, float64, int32, int64, complex64, complex128. All the Tensors in ``input`` must have same data type. Currently we only support tensors with rank no greater than 5. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1390,6 +1390,8 @@ def broadcast_tensors(input, name=None): 'int32', 'int64', 'uint16', + 'complex64', + 'complex128', ], 'broadcast_tensors', ) @@ -3037,7 +3039,7 @@ def unbind(input, axis=0): Removes a tensor dimension, then split the input tensor into multiple sub-Tensors. Args: - input (Tensor): The input variable which is an N-D Tensor, data type being bool, float16, float32, float64, int32 or int64. + input (Tensor): The input variable which is an N-D Tensor, data type being bool, float16, float32, float64, int32, int64, complex64 or complex128. axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. Returns: @@ -3094,6 +3096,8 @@ def unbind(input, axis=0): 'float64', 'int32', 'int64', + 'complex64', + 'complex128', ], 'unbind', ) diff --git a/test/cpp/phi/kernels/test_ternary_broadcast.cu b/test/cpp/phi/kernels/test_ternary_broadcast.cu index 09598e637909a..959b79725f07a 100644 --- a/test/cpp/phi/kernels/test_ternary_broadcast.cu +++ b/test/cpp/phi/kernels/test_ternary_broadcast.cu @@ -122,6 +122,22 @@ TEST(Broadcast, add) { dim_out, times, AddTernary_1()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_1>()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_1>()); } while (0); do { @@ -145,6 +161,22 @@ TEST(Broadcast, add) { dim_out, times, AddTernary_2()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_2>()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_2>()); } while (0); do { @@ -168,6 +200,22 @@ TEST(Broadcast, add) { dim_out, times, AddTernary_3()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_3>()); + TestCase>( + *dev_ctx, + dim1, + dim2, + dim3, + dim_out, + times, + AddTernary_3>()); } while (0); #endif } diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index 79a289f65890e..759d76dabef9a 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -671,6 +671,8 @@ def infer_dtype(numpy_dict, dtype_set): input_dtype_set = set() infer_dtype(inputs, input_dtype_set) dtype_list = [ + np.dtype(np.complex128), + np.dtype(np.complex64), np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16), diff --git a/test/legacy_test/test_broadcast_error.py b/test/legacy_test/test_broadcast_error.py index d42cc6d9b8840..e5defec467d9f 100644 --- a/test/legacy_test/test_broadcast_error.py +++ b/test/legacy_test/test_broadcast_error.py @@ -23,7 +23,12 @@ class TestBroadcastOpCpu(OpTest): def setUp(self): self.op_type = "broadcast" - input = np.random.random((100, 2)).astype("float32") + self.init_dtype() + input = np.random.random((100, 2)).astype(self.dtype) + if self.dtype == 'complex64' or self.dtype == 'complex128': + input = ( + np.random.random((100, 2)) + 1j * np.random.random((100, 2)) + ).astype(self.dtype) np_out = input[:] self.inputs = {"X": input} self.attrs = {"sync_mode": False, "root": 0} @@ -35,6 +40,19 @@ def test_check_output_cpu(self): except: print("do not support cpu test, skip") + def init_dtype(self): + self.dtype = 'float32' + + +class TestBroadcastOpCpu_complex64(TestBroadcastOpCpu): + def init_dtype(self): + self.dtype = 'complex64' + + +class TestBroadcastOpCpu_complex128(TestBroadcastOpCpu): + def init_dtype(self): + self.dtype = 'complex128' + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_broadcast_tensors_op.py b/test/legacy_test/test_broadcast_tensors_op.py index 9f5b7b76caacb..d8de6e1bba8a8 100644 --- a/test/legacy_test/test_broadcast_tensors_op.py +++ b/test/legacy_test/test_broadcast_tensors_op.py @@ -47,7 +47,10 @@ def find_output_shape(input_list): def make_inputs_outputs(input_shapes, dtype, is_bfloat16=False): """Automatically generate formatted inputs and outputs from input_shapes""" input_list = [ - np.random.random(shape).astype(dtype) for shape in input_shapes + (np.random.random(shape) + 1j * np.random.random(shape)).astype(dtype) + if dtype == 'complex64' or dtype == 'complex128' + else np.random.random(shape).astype(dtype) + for shape in input_shapes ] output_shape = find_output_shape(input_list) output_list = [ @@ -98,8 +101,8 @@ class TestCPUBroadcastTensorsOp(OpTest): def set_place(self): self.place = core.CPUPlace() - def set_dtypes(self): - self.dtypes = ['float64'] + def set_dtype(self): + self.dtype = 'float64' def setUp(self): self.op_type = "broadcast_tensors" @@ -112,26 +115,24 @@ def setUp(self): gen_empty_tensors_test, ] self.set_place() - self.set_dtypes() + self.set_dtype() self.python_api = paddle.broadcast_tensors def run_dual_test(self, test_func, args): - for dtype in self.dtypes: - for gen_func in self.test_gen_func_list: - self.inputs, self.outputs = gen_func(dtype) - if len(self.outputs["Out"]) < 3: - self.python_out_sig = [ - f"out{i}" for i in range(len(self.outputs["Out"])) - ] - test_func(**args) + for gen_func in self.test_gen_func_list: + self.inputs, self.outputs = gen_func(self.dtype) + if len(self.outputs["Out"]) < 3: + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) def run_triple_in_test(self, test_func, args): - for dtype in self.dtypes: - self.inputs, self.outputs = self.test_gen_func_list[2](dtype) - self.python_out_sig = [ - f"out{i}" for i in range(len(self.outputs["Out"])) - ] - test_func(**args) + self.inputs, self.outputs = self.test_gen_func_list[2](self.dtype) + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) def test_check_output(self): self.run_dual_test( @@ -160,6 +161,16 @@ def test_check_grad_normal(self): ) +class TestCPUBroadcastTensorsOp_complex64(TestCPUBroadcastTensorsOp): + def set_dtypes(self): + self.dtype = 'complex64' + + +class TestCPUBroadcastTensorsOp_complex128(TestCPUBroadcastTensorsOp): + def set_dtypes(self): + self.dtype = 'complex128' + + @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) @@ -239,28 +250,44 @@ def test_check_grad_normal(self): class TestBroadcastTensorsAPI(unittest.TestCase): + def setUp(self): + self.dtype = 'float32' + def test_api(self): @test_with_pir_api def test_static(): - inputs = [ - paddle.static.data( - shape=[-1, 4, 1, 4, 1], dtype='float32', name="x0" - ), - paddle.static.data( - shape=[-1, 1, 4, 1, 4], dtype='float32', name="x1" - ), - ] - paddle.broadcast_tensors(inputs) + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + inputs = [ + paddle.static.data( + shape=[-1, 4, 1, 4, 1], dtype=self.dtype, name="x0" + ), + paddle.static.data( + shape=[-1, 1, 4, 1, 4], dtype=self.dtype, name="x1" + ), + ] + paddle.broadcast_tensors(inputs) def test_dynamic(): paddle.disable_static() try: inputs = [ paddle.to_tensor( - np.random.random([4, 1, 4, 1]).astype("float32") + np.random.random([4, 1, 4, 1]).astype(self.dtype) + if self.dtype == 'float32' + else ( + np.random.random([4, 1, 4, 1]) + + 1j * np.random.random([4, 1, 4, 1]) + ).astype(self.dtype) ), paddle.to_tensor( - np.random.random([1, 4, 1, 4]).astype("float32") + np.random.random([1, 4, 1, 4]).astype(self.dtype) + if self.dtype == 'float32' + else ( + np.random.random([1, 4, 1, 4]) + + 1j * np.random.random([1, 4, 1, 4]) + ).astype(self.dtype) ), ] paddle.broadcast_tensors(inputs) @@ -271,6 +298,16 @@ def test_dynamic(): test_dynamic() +class TestBroadcastTensorsAPI_complex64(TestBroadcastTensorsAPI): + def setUp(self): + self.dtype = 'complex64' + + +class TestBroadcastTensorsAPI_complex128(TestBroadcastTensorsAPI): + def setUp(self): + self.dtype = 'complex128' + + class TestRaiseBroadcastTensorsError(unittest.TestCase): def test_errors(self): def test_type(): @@ -306,9 +343,21 @@ def test_bcast_semantics(): ] paddle.broadcast_tensors(inputs) + def test_bcast_semantics_complex64(): + inputs = [ + paddle.static.data( + shape=[-1, 1, 3, 1, 1], dtype='complex64', name="x11" + ), + paddle.static.data( + shape=[-1, 1, 8, 1, 1], dtype='complex64', name="x12" + ), + ] + paddle.broadcast_tensors(inputs) + self.assertRaises(TypeError, test_type) self.assertRaises(TypeError, test_dtype) self.assertRaises(TypeError, test_bcast_semantics) + self.assertRaises(TypeError, test_bcast_semantics_complex64) class TestRaiseBroadcastTensorsErrorDyGraph(unittest.TestCase): diff --git a/test/legacy_test/test_unbind_op.py b/test/legacy_test/test_unbind_op.py index c01858c06ad5e..833ffd824bb13 100644 --- a/test/legacy_test/test_unbind_op.py +++ b/test/legacy_test/test_unbind_op.py @@ -24,28 +24,38 @@ class TestUnbind(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.input_1 = np.random.random([2, 3]).astype(self.dtype) + if self.dtype == 'complex64' or self.dtype == 'complex128': + self.input_1 = ( + np.random.random([2, 3]) + 1j * np.random.random([2, 3]) + ).astype(self.dtype) + + def init_dtype(self): + self.dtype = 'float32' + @test_with_pir_api def test_unbind(self): paddle.enable_static() - + self.init_dtype() main_program = static.Program() startup_program = static.Program() with static.program_guard( main_program=main_program, startup_program=startup_program ): - x_1 = paddle.static.data(shape=[2, 3], dtype='float32', name='x_1') + x_1 = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x_1') [out_0, out_1] = tensor.unbind(input=x_1, axis=0) - input_1 = np.random.random([2, 3]).astype("float32") axis = paddle.static.data(shape=[], dtype='int32', name='axis') exe = base.Executor(place=base.CPUPlace()) [res_1, res_2] = exe.run( - feed={"x_1": input_1, "axis": 0}, + feed={"x_1": self.input_1, "axis": 0}, fetch_list=[out_0, out_1], ) - np.testing.assert_array_equal(res_1, input_1[0, 0:100]) - np.testing.assert_array_equal(res_2, input_1[1, 0:100]) + np.testing.assert_array_equal(res_1, self.input_1[0, 0:100]) + np.testing.assert_array_equal(res_2, self.input_1[1, 0:100]) @test_with_pir_api def test_unbind_static_fp16_gpu(self): @@ -73,38 +83,74 @@ def test_unbind_static_fp16_gpu(self): def test_unbind_dygraph(self): with base.dygraph.guard(): - np_x = np.random.random([2, 3]).astype("float32") - x = paddle.to_tensor(np_x) + x = paddle.to_tensor(self.input_1) x.stop_gradient = False [res_1, res_2] = paddle.unbind(x, 0) - np.testing.assert_array_equal(res_1, np_x[0, 0:100]) - np.testing.assert_array_equal(res_2, np_x[1, 0:100]) + np.testing.assert_array_equal(res_1, self.input_1[0, 0:100]) + np.testing.assert_array_equal(res_2, self.input_1[1, 0:100]) out = paddle.add_n([res_1, res_2]) - np_grad = np.ones(x.shape, np.float32) + np_grad = np.ones(x.shape, self.dtype) out.backward() np.testing.assert_array_equal(x.grad.numpy(False), np_grad) +class TestUnbind_complex64(TestUnbind): + def init_dtype(self): + self.dtype = 'complex64' + + def test_unbind_static_fp16_gpu(self): + pass + + +class TestUnbind_complex128(TestUnbind): + def init_dtype(self): + self.dtype = 'complex128' + + def test_unbind_static_fp16_gpu(self): + pass + + class TestLayersUnbind(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.input_1 = np.random.random([2, 3]).astype(self.dtype) + if self.dtype == 'complex64' or self.dtype == 'complex128': + self.input_1 = ( + np.random.random([2, 3]) + 1j * np.random.random([2, 3]) + ).astype(self.dtype) + + def init_dtype(self): + self.dtype = 'float32' + @test_with_pir_api def test_layers_unbind(self): paddle.enable_static() + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + x_1 = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x_1') + [out_0, out_1] = paddle.unbind(input=x_1, axis=0) + axis = paddle.static.data(shape=[], dtype='int32', name='axis') + exe = base.Executor(place=base.CPUPlace()) + [res_1, res_2] = exe.run( + feed={"x_1": self.input_1, "axis": 0}, + fetch_list=[out_0, out_1], + ) - x_1 = paddle.static.data(shape=[2, 3], dtype='float32', name='x_1') - [out_0, out_1] = paddle.unbind(input=x_1, axis=0) - input_1 = np.random.random([2, 3]).astype("float32") - axis = paddle.static.data(shape=[], dtype='int32', name='axis') - exe = base.Executor(place=base.CPUPlace()) + np.testing.assert_array_equal(res_1, self.input_1[0, 0:100]) + np.testing.assert_array_equal(res_2, self.input_1[1, 0:100]) - [res_1, res_2] = exe.run( - feed={"x_1": input_1, "axis": 0}, - fetch_list=[out_0, out_1], - ) - np.testing.assert_array_equal(res_1, input_1[0, 0:100]) - np.testing.assert_array_equal(res_2, input_1[1, 0:100]) +class TestLayersUnbind_complex64(TestLayersUnbind): + def init_dtype(self): + self.dtype = 'complex64' + + +class TestLayersUnbind_complex128(TestLayersUnbind): + def init_dtype(self): + self.dtype = 'complex128' class TestUnbindOp(OpTest): @@ -126,6 +172,11 @@ def setUp(self): self.num = 3 self.initParameters() x = np.arange(12).reshape(3, 2, 2).astype(self.dtype) + if self.dtype == np.complex64 or self.dtype == np.complex128: + x = ( + np.arange(12).reshape(3, 2, 2) + + 1j * np.arange(12).reshape(3, 2, 2) + ).astype(self.dtype) self.out = np.split(x, self.num, self.axis) self.outReshape() self.inputs = {'X': x} @@ -208,6 +259,46 @@ def outReshape(self): self.out[1] = self.out[1].reshape((3, 2)) +class TestUnbindOp1_Complex64(TestUnbindOp1): + def get_dtype(self): + return np.complex64 + + +class TestUnbindOp2_Complex64(TestUnbindOp2): + def get_dtype(self): + return np.complex64 + + +class TestUnbindOp3_Complex64(TestUnbindOp3): + def get_dtype(self): + return np.complex64 + + +class TestUnbindOp4_Complex64(TestUnbindOp4): + def get_dtype(self): + return np.complex64 + + +class TestUnbindOp1_Complex128(TestUnbindOp1): + def get_dtype(self): + return np.complex128 + + +class TestUnbindOp2_Complex128(TestUnbindOp2): + def get_dtype(self): + return np.complex128 + + +class TestUnbindOp3_Complex128(TestUnbindOp3): + def get_dtype(self): + return np.complex128 + + +class TestUnbindOp4_Complex128(TestUnbindOp4): + def get_dtype(self): + return np.complex128 + + class TestUnbindFP16Op(OpTest): def setUp(self): paddle.disable_static() @@ -278,10 +369,15 @@ def test_check_grad(self): class TestUnbindAxisError(unittest.TestCase): + def setUp(self): + self.dtype = 'float32' + @test_with_pir_api def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): - x = paddle.static.data(shape=[2, 3], dtype='float32', name='x') + x = paddle.static.data(shape=[2, 3], dtype=self.dtype, name='x') def test_table_Variable(): tensor.unbind(input=x, axis=2.0) @@ -294,6 +390,16 @@ def test_invalid_axis(): self.assertRaises(ValueError, test_invalid_axis) +class TestUnbindAxisError_complex64(TestUnbindAxisError): + def setUp(self): + self.dtype = 'complex64' + + +class TestUnbindAxisError_complex128(TestUnbindAxisError): + def setUp(self): + self.dtype = 'complex128' + + class TestUnbindBool(unittest.TestCase): def test_bool(self): x = paddle.to_tensor([[True, True], [False, False]]) From 7331bf95ef13d2a413c91e7da6c7c23000381d4e Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Wed, 29 Nov 2023 10:45:43 +0800 Subject: [PATCH 11/14] [auto parallel] add embedding backward spmd rules (#59003) * add embedding backward spmd rules * [auto parallel] add dense embedding_grad spmd rule. * fix embedding out_grad is reshard bugs --- paddle/phi/api/lib/api_custom_impl.cc | 37 ++-- paddle/phi/api/yaml/legacy_ops.yaml | 1 + paddle/phi/infermeta/spmd_rules/embedding.cc | 112 ++++++++++++ paddle/phi/infermeta/spmd_rules/embedding.h | 9 +- .../phi/kernels/cpu/embedding_grad_kernel.cc | 1 - ...py => semi_auto_parallel_for_embedding.py} | 52 +++--- .../test_semi_auto_parallel_basic.py | 4 +- test/cpp/auto_parallel/spmd_rule_test.cc | 159 ++++++++++++++++++ 8 files changed, 329 insertions(+), 46 deletions(-) rename test/auto_parallel/{semi_auto_parallel_for_embedding_grad.py => semi_auto_parallel_for_embedding.py} (76%) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 72c5bc3485772..efb7a0befe281 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -264,12 +264,17 @@ void embedding_grad_impl(const Tensor& x, auto meta_dist_input_x = MakeDistMetaTensor(*x.impl()); auto meta_dist_input_weight = MakeDistMetaTensor(*weight.impl()); auto meta_dist_input_out_grad = MakeDistMetaTensor(*out_grad.impl()); - auto spmd_info = phi::distributed::VariadicReplicatedInferSpmdDynamic( - meta_dist_input_weight, meta_dist_input_x, meta_dist_input_out_grad); + auto spmd_info = + phi::distributed::EmbeddingGradInferSpmd(meta_dist_input_x, + meta_dist_input_weight, + meta_dist_input_out_grad, + padding_idx, + sparse); // 2. Create Temporary Output & Prepare Dist and Dense Output std::shared_ptr shared_dist_out = - CreateKernelDistOutput(weight_grad, !rank_is_in_current_mesh); + CreateKernelDistOutput( + weight_grad, !rank_is_in_current_mesh, spmd_info.second[0]); phi::distributed::DistTensor* dist_out = shared_dist_out.get(); phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value(); if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) { @@ -284,37 +289,29 @@ void embedding_grad_impl(const Tensor& x, UnchangedInferMeta(MakeMetaTensor(*weight.impl()), &meta_dist_out); // 4. Set Output Dist Attr For Default Impl - auto current_process_mesh = - paddle::holds_alternative( - spmd_info.first[0]) - ? paddle::get<0>(spmd_info.first[0]).process_mesh() - : paddle::get<1>(spmd_info.first[0]).at(0).process_mesh(); - SetReplicatedDistAttrForOutput(dist_out, current_process_mesh); if (rank_is_in_current_mesh) { // 5. Reshard Input - auto dist_input_weight = - ReshardApiInputToKernelInput(dev_ctx, weight, spmd_info.first[0]); auto dist_input_x = - ReshardApiInputToKernelInput(dev_ctx, x, spmd_info.first[1]); + ReshardApiInputToKernelInput(dev_ctx, x, spmd_info.first[0]); + auto dist_input_weight = + ReshardApiInputToKernelInput(dev_ctx, weight, spmd_info.first[1]); auto dist_input_out_grad = ReshardApiInputToKernelInput(dev_ctx, out_grad, spmd_info.first[2]); // 6. PrepareData (DataTransform & Prepare Dense Input) - dist_input_weight = PrepareDataForDistTensor( - dist_input_weight, + dist_input_x = PrepareDataForDistTensor( + dist_input_x, GetKernelInputArgDef(kernel.InputAt(0), kernel_key.backend()), {}, kernel_result.is_stride_kernel); - auto input_weight = &dist_input_weight->value(); - - dist_input_x = PrepareDataForDistTensor( - dist_input_x, + auto input_x = &dist_input_x->value(); + dist_input_weight = PrepareDataForDistTensor( + dist_input_weight, GetKernelInputArgDef(kernel.InputAt(1), kernel_key.backend()), {}, kernel_result.is_stride_kernel); - auto input_x = &dist_input_x->value(); - + auto input_weight = &dist_input_weight->value(); dist_input_out_grad = PrepareDataForDistTensor( dist_input_out_grad, GetKernelInputArgDef(kernel.InputAt(2), kernel_key.backend()), diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 3cfcf155f47c4..e9eaef16b6a29 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -362,6 +362,7 @@ infer_meta : func : EmbeddingInferMeta param : [x, weight, padding_idx] + spmd_rule: EmbeddingInferSpmd kernel : func : embedding {dense, dense -> dense} sparse_weight_embedding {dense, selected_rows -> dense} diff --git a/paddle/phi/infermeta/spmd_rules/embedding.cc b/paddle/phi/infermeta/spmd_rules/embedding.cc index b9d3ee7904ba7..99aec54cd5474 100644 --- a/paddle/phi/infermeta/spmd_rules/embedding.cc +++ b/paddle/phi/infermeta/spmd_rules/embedding.cc @@ -19,6 +19,8 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/matmul.h" +#include "paddle/phi/infermeta/spmd_rules/reshape.h" #include "paddle/phi/infermeta/spmd_rules/utils.h" namespace phi { @@ -208,5 +210,115 @@ SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x, return {{x_dist_attr, weight_dist_attr}, {out_dist_attr_src}}; } +SpmdInfo EmbeddingGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& weight, + const DistMetaTensor& out_grad, + int64_t padding_idx, + bool sparse) { + PADDLE_ENFORCE_EQ(out_grad.dims().size(), + out_grad.dist_attr().dims_mapping().size(), + phi::errors::InvalidArgument( + "The Tensor out_grad's rank [%d] and out_grad's " + "dims_mapping size [%d] are not matched.", + out_grad.dims(), + out_grad.dist_attr().dims_mapping().size())); + + if (sparse) { + PADDLE_THROW(phi::errors::InvalidArgument( + "EmbeddingGradInferSpmd does't support sparse currently.")); + } + + // Propagate sharding info using composite operators. + // The whole mathematical expression of EmbeddingGrad is: + // w_grad = einsum('...j, ...k->jk', onehot(x, j), out_grad) + + // TODO(cxxly): Simplifies the code logic of sharding propagation using + // primitive operators. + DistMetaTensor x_dst(x.dims(), x.dist_attr()); + DistMetaTensor w_dst(weight.dims(), weight.dist_attr()); + DistMetaTensor out_grad_dst(out_grad.dims(), out_grad.dist_attr()); + DistMetaTensor w_grad(weight.dims(), weight.dist_attr()); + + // Step1: t0 = onehot(x_dst, w_dst.shape[0]) = eye(num_classes)[x_dst] + auto t0_dims_mapping = x_dst.dist_attr().dims_mapping(); + t0_dims_mapping.emplace_back(-1); + TensorDistAttr t0_dist_attr(x.dist_attr()); + t0_dist_attr.set_dims_mapping(t0_dims_mapping); + auto t0_shape = phi::vectorize(x.dims()); + t0_shape.emplace_back(w_dst.dims()[0]); + DistMetaTensor t0(phi::make_ddim(t0_shape), t0_dist_attr); + + // Step2: w_grad = einsum('...j, ...k -> jk', t0, out_grad_dst) + // Step 2.1: Build Einsum Notation + std::string alphabet = "abcdefghijlmnopqrstuvwxyz"; + std::string t0_axes = + GetBroadcastAxes(t0.dims().size(), t0.dims().size(), alphabet); + std::string out_grad_dst_axes = t0_axes.substr(0, t0_axes.length() - 1) + "k"; + std::string w_grad_axes = t0_axes.substr(t0_axes.length() - 1, 1) + "k"; + + // Step2.2: Sharding Propogation + // Step2.2.1: merge input shardings + auto axis_to_dim_map = ShardingMergeForTensors( + {{t0_axes, t0.dist_attr().dims_mapping()}, + {out_grad_dst_axes, out_grad_dst.dist_attr().dims_mapping()}}, + false); + + // Step2.2.2: infer output's dims mapping. + auto w_grad_dist_attr = w_grad.dist_attr(); + std::vector w_grad_dims_mapping = + GetDimsMappingForAxes(w_grad_axes, axis_to_dim_map); + w_grad_dist_attr.set_dims_mapping(w_grad_dims_mapping); + + // Step2.2.3: merge potential conflict in inputs, + // update input dims mapping with merged shardings. + t0_dist_attr.set_dims_mapping( + GetDimsMappingForAxes(t0_axes, axis_to_dim_map)); + auto out_grad_dst_dist_attr = out_grad_dst.dist_attr(); + out_grad_dst_dist_attr.set_dims_mapping( + GetDimsMappingForAxes(out_grad_dst_axes, axis_to_dim_map)); + + // Step2.2.4: Handle Partial + std::vector partial_on_dims = + ResoluteOutputPartialDimension(axis_to_dim_map, w_grad_axes); + w_grad_dist_attr.set_partial_status(partial_on_dims); + + // Step2.3: Update inputs info. + // NOTE: Reshard happend on intemediate operators must be ensure propagated + // back to first inputs. + t0 = DistMetaTensor(t0.dims(), t0_dist_attr); + const auto& t0_dims = t0.dist_attr().dims_mapping(); + if (x_dst.dist_attr().dims_mapping() != + std::vector(t0_dims.begin(), t0_dims.end() - 1)) { + TensorDistAttr t0_new(t0.dist_attr()); + t0_new.set_dims_mapping( + std::vector(t0_dims.begin(), t0_dims.end() - 1)); + x_dst = DistMetaTensor(x_dst.dims(), t0_new); + } + out_grad_dst = DistMetaTensor(out_grad_dst.dims(), out_grad_dst_dist_attr); + w_grad = DistMetaTensor(w_grad.dims(), w_grad_dist_attr); + + VLOG(6) << "EmbeddingGradInferSpmd:\n" + << "Input x shape: [" << str_join(phi::vectorize(x.dims())) + << "], src_dims_mapping: [" << str_join(x.dist_attr().dims_mapping()) + << "], dst_dims_mapping: [" + << str_join(x_dst.dist_attr().dims_mapping()) << "]\n" + << "Input weight shape: [" << str_join(phi::vectorize(weight.dims())) + << "], src_dims_mapping: [" + << str_join(weight.dist_attr().dims_mapping()) + << "], dst_dims_mapping: [" + << str_join(w_dst.dist_attr().dims_mapping()) << "]\n" + << "Input out_grad shape: [" + << str_join(phi::vectorize(out_grad.dims())) + << "], src_dims_mapping: [" + << str_join(out_grad.dist_attr().dims_mapping()) + << "], dst_dims_mapping: [" + << str_join(out_grad_dst.dist_attr().dims_mapping()) << "]\n" + << "Output w_grad shape: [" << str_join(phi::vectorize(w_grad.dims())) + << "], dims_mapping: [" << str_join(w_grad.dist_attr().dims_mapping()) + << "]\n\n"; + + return {{x_dst.dist_attr(), w_dst.dist_attr(), out_grad_dst.dist_attr()}, + {w_grad.dist_attr()}}; +} } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/embedding.h b/paddle/phi/infermeta/spmd_rules/embedding.h index 6b1d3614442bd..08782334f9e21 100644 --- a/paddle/phi/infermeta/spmd_rules/embedding.h +++ b/paddle/phi/infermeta/spmd_rules/embedding.h @@ -29,13 +29,18 @@ namespace distributed { SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x, const DistMetaTensor& weight, int padding_idx, - bool sparse); + bool sparse = false); SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& weight, const DistMetaTensor& out, int padding_idx, - bool sparse); + bool sparse = false); +SpmdInfo EmbeddingGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& weight, + const DistMetaTensor& out_grad, + int64_t padding_idx, + bool sparse = false); } // namespace distributed } // namespace phi diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc index fabb4e83d52f7..d3dbc06f6c04f 100644 --- a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc @@ -58,7 +58,6 @@ struct EmbeddingGradCPUFunctor { auto* d_table_data = weight_grad_->data(); memset(d_table_data, 0, weight_grad_->numel() * sizeof(T)); - for (int64_t i = 0; i < ids_num; ++i) { if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) { // the gradient of padding_idx should be 0, already done by memset, so diff --git a/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py b/test/auto_parallel/semi_auto_parallel_for_embedding.py similarity index 76% rename from test/auto_parallel/semi_auto_parallel_for_embedding_grad.py rename to test/auto_parallel/semi_auto_parallel_for_embedding.py index 19d26d8437fa4..8f91a68f755c7 100644 --- a/test/auto_parallel/semi_auto_parallel_for_embedding_grad.py +++ b/test/auto_parallel/semi_auto_parallel_for_embedding.py @@ -21,7 +21,7 @@ from paddle.distributed import Replicate, Shard -class TestCustomEmbeddingGradApiForSemiAutoParallel: +class TestEmbeddingApiForSemiAutoParallel: def __init__(self): self._dtype = os.getenv("dtype") self._backend = os.getenv("backend") @@ -100,28 +100,37 @@ def test_w_col_shard(self): ) def test_x_row_w_col_shard(self): - self.test_body( - x_shape=[12, 16], - w_shape=[10, 4], - x_placements=[Shard(0)], - w_placements=[Shard(1)], - ) + try: + self.test_body( + x_shape=[12, 16], + w_shape=[10, 4], + x_placements=[Shard(0)], + w_placements=[Shard(1)], + ) + except RuntimeError as e: + assert 'sharded by same mesh dimension ' in str(e) def test_x_col_w_row_shard(self): - self.test_body( - x_shape=[12, 16], - w_shape=[10, 4], - x_placements=[Shard(1)], - w_placements=[Shard(0)], - ) + try: + self.test_body( + x_shape=[12, 16], + w_shape=[10, 4], + x_placements=[Shard(1)], + w_placements=[Shard(0)], + ) + except RuntimeError as e: + assert 'sharded by same mesh dimension ' in str(e) def test_both_col_shard(self): - self.test_body( - x_shape=[12, 16], - w_shape=[10, 4], - x_placements=[Shard(1)], - w_placements=[Shard(1)], - ) + try: + self.test_body( + x_shape=[12, 16], + w_shape=[10, 4], + x_placements=[Shard(1)], + w_placements=[Shard(1)], + ) + except RuntimeError as e: + assert 'sharded by same mesh dimension', str(e) def run_test_case(self): if self._backend == "cpu": @@ -134,7 +143,8 @@ def run_test_case(self): self.test_non_shard() self.test_x_row_shard() self.test_x_col_shard() - self.test_w_row_shard() + # Sharding along weight's row axis is not supported by raw embdding kernel. + # self.test_w_row_shard() self.test_w_col_shard() self.test_x_row_w_col_shard() self.test_x_col_w_row_shard() @@ -142,4 +152,4 @@ def run_test_case(self): if __name__ == '__main__': - TestCustomEmbeddingGradApiForSemiAutoParallel().run_test_case() + TestEmbeddingApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py index c933f3784a980..245bf00a197f4 100644 --- a/test/auto_parallel/test_semi_auto_parallel_basic.py +++ b/test/auto_parallel/test_semi_auto_parallel_basic.py @@ -138,13 +138,13 @@ def test_flash_attention_api(self): user_defined_envs=envs, ) - def test_custom_embedding_grad_api(self): + def test_embedding_api(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs ) for envs in envs_list: self.run_test_case( - "semi_auto_parallel_for_embedding_grad.py", + "semi_auto_parallel_for_embedding.py", user_defined_envs=envs, ) diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index 2a13d46db2798..aec8e4d0445a5 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include "paddle/phi/core/distributed/type_defs.h" +#include "paddle/phi/infermeta/spmd_rules/embedding.h" #include "paddle/phi/infermeta/spmd_rules/replicated.h" #include "paddle/phi/infermeta/spmd_rules/rules.h" @@ -1348,6 +1349,164 @@ TEST(ElementwiseUnaryLike, Ctor) { check_element_unary_like(infered_dist_attrs); } +TEST(EmbeddingGradInferSpmd, Ctor) { + // build input data class + std::vector x_shape = {4, 5}; + std::vector w_shape = {10, 3}; + std::vector out_grad_shape = {4, 5, 3}; + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + ProcessMesh process_mesh(mesh_shape, process_ids, dim_names); + + // indices is shard, embedding table is replicated, + TensorDistAttr x_dist_attr = TensorDistAttr(); + x_dist_attr.set_process_mesh(process_mesh); + x_dist_attr.set_dims_mapping(std::vector({1, -1})); + x_dist_attr.set_dynamic_dims(std::vector({false, false})); + + TensorDistAttr w_dist_attr = TensorDistAttr(); + w_dist_attr.set_process_mesh(process_mesh); + w_dist_attr.set_dims_mapping(std::vector({-1, -1})); + w_dist_attr.set_dynamic_dims(std::vector({false, false})); + + TensorDistAttr out_grad_dist_attr = TensorDistAttr(); + out_grad_dist_attr.set_process_mesh(process_mesh); + out_grad_dist_attr.set_dims_mapping(std::vector({-1, -1, -1})); + out_grad_dist_attr.set_dynamic_dims(std::vector({false, false})); + + phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor w(phi::make_ddim(w_shape), w_dist_attr); + phi::distributed::DistMetaTensor out_grad(phi::make_ddim(out_grad_shape), + out_grad_dist_attr); + + auto spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false); + + EXPECT_EQ(spmdinfo.first.size(), 3UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector({1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), + std::vector({1, -1, -1})); + + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({-1, -1})); + EXPECT_EQ( + PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0]) + .is_partial(), + true); + VLOG(4) << "Test EmbeddingGradInferSpmd with sharding indices and " + "replicating weight" + << std::endl + << std::endl + << std::endl; + + // indices'rank is greater than 1, x and weight is replicated, out_grad is + // sharded along axis 1 + x_dist_attr.set_dims_mapping({-1, -1}); + w_dist_attr.set_dims_mapping({-1, 1}); + out_grad_dist_attr.set_dims_mapping({-1, 1, -1}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr); + out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape), + out_grad_dist_attr); + + spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false); + + EXPECT_EQ(spmdinfo.first.size(), 3UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector({-1, 1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector({-1, 1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), + std::vector({-1, 1, -1})); + + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({-1, -1})); + EXPECT_EQ( + PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0]) + .is_partial(), + true); + VLOG(4) << "Test EmbeddingGradInferSpmd with replicating indices and " + "sharding weight along col axis." + << std::endl + << std::endl + << std::endl; + + // Indices's rank equals 1, indices and out_grad is sharded. + x_shape = {5}; + w_shape = {10, 3}; + out_grad_shape = {5, 3}; + + x_dist_attr.set_dims_mapping(std::vector({0})); + w_dist_attr.set_dims_mapping(std::vector({-1, -1})); + out_grad_dist_attr.set_dims_mapping(std::vector({-1, 1})); + + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr); + out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape), + out_grad_dist_attr); + + spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false); + + EXPECT_EQ(spmdinfo.first.size(), 3UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector({0})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), std::vector({0, 1})); + + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({-1, 1})); + EXPECT_EQ( + PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0]) + .is_partial(), + true); + VLOG(4) << "Test EmbeddingGradInferSpmd with sharding weight and out_grad." + << std::endl + << std::endl + << std::endl; + + x_shape = {12, 16}; + w_shape = {10, 4}; + out_grad_shape = {12, 16, 4}; + + x_dist_attr.set_dims_mapping(std::vector({-1, -1})); + w_dist_attr.set_dims_mapping(std::vector({-1, -0})); + out_grad_dist_attr.set_dims_mapping(std::vector({-1, -1, 0})); + + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + w = phi::distributed::DistMetaTensor(phi::make_ddim(w_shape), w_dist_attr); + out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape), + out_grad_dist_attr); + + spmdinfo = EmbeddingGradInferSpmd(x, w, out_grad, -1, false); + + EXPECT_EQ(spmdinfo.first.size(), 3UL); + EXPECT_EQ(spmdinfo.second.size(), 1UL); + + EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), + std::vector({-1, -1})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector({-1, 0})); + EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), + std::vector({-1, -1, 0})); + + EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]), + std::vector({-1, 0})); + EXPECT_EQ( + PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmdinfo.second[0]) + .is_partial(), + false); + VLOG(4) << "Test EmbeddingGradInferSpmd with sharding weight and out_grad." + << std::endl + << std::endl + << std::endl; +} + } // namespace auto_parallel } // namespace distributed } // namespace paddle From c97365b7d7fced3e16b24359068bbba7e3bd0417 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:54:54 +0800 Subject: [PATCH 12/14] [PIR] Refine update_ops.yaml gen method (#59444) * fix * fix * fix --- .../fluid/pir/dialect/op_generator/api_gen.py | 8 -------- .../fluid/pir/dialect/op_generator/op_gen.py | 6 ------ paddle/fluid/pir/dialect/operator/ir/ops.yaml | 19 ++++--------------- .../pir/dialect/operator/ir/ops_backward.yaml | 5 +++++ .../pir/dialect/operator/ir/update_ops.yaml | 5 ++++- paddle/fluid/primitive/codegen/gen.py | 8 +++----- 6 files changed, 16 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py index 64bf32910c274..e6453cd271273 100644 --- a/paddle/fluid/pir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -21,9 +21,7 @@ PD_MANUAL_OP_LIST, OpCompatParser, OpInfoParser, - check_need_update_ops, to_pascal_case, - update_ops, ) PD_MANUAL_API_LIST = { @@ -164,18 +162,12 @@ def __init__(self) -> None: def _parse_yaml(self, op_yaml_files, op_compat_yaml_file): op_compat_parser = OpCompatParser(op_compat_yaml_file) - need_update_ops, update_yaml_file = check_need_update_ops(op_yaml_files) op_yaml_items = [] for yaml_file in op_yaml_files: - if update_yaml_file == yaml_file: - continue with open(yaml_file, "r") as f: ops = yaml.safe_load(f) op_yaml_items = op_yaml_items + ops - # replace old ir ops with pir ops - if need_update_ops: - update_ops(op_yaml_items, update_yaml_file) op_info_items = [] for op in op_yaml_items: diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 10d79bc8b2791..262a434de8c7b 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -1055,18 +1055,12 @@ def OpGenerator( # (2) Prepare: Get all op item in all op_yaml_files op_compat_parser = OpCompatParser(op_compat_yaml_file) - need_update_ops, update_yaml_file = check_need_update_ops(op_yaml_files) op_yaml_items = [] for yaml_file in op_yaml_files: - if update_yaml_file == yaml_file: - continue with open(yaml_file, "r") as f: ops = yaml.safe_load(f) op_yaml_items = op_yaml_items + ops - # replace old ir ops with pir ops - if need_update_ops: - update_ops(op_yaml_items, update_yaml_file) op_info_items = {} for op in op_yaml_items: diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index dbb6ad62f1082..460ca5ad373ce 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -1,5 +1,7 @@ -# The apis in this file are unstandardized that may caused by a variety of reasons, -# we are trying to fix these apis and will move standardized apis into ops.yaml. +# The operators included in this file are: +# 1) Operators defined only in PIR, dynamic graphs do not exist; +# 2) The definitions of static graphs and dynamic graphs are inconsistent, but the final definition plan has not yet been clarified. +# After the definition is clearly defined, migrate to paddle /fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml - op : adadelta_ args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision) @@ -83,19 +85,6 @@ kernel : func : any -- op : arange - args : (Scalar start, Scalar end, Scalar step, DataType dtype=DataType::FLOAT64, Place place=CPUPlace()) - output : Tensor(out) - infer_meta : - func : ArangeInferMeta - param : [start, end, step, dtype] - kernel : - func : arange - param : [start, end, step] - data_type : dtype - backend : place - support_tensor : [start, end, step] - - op : assert args : (Tensor cond, Tensor[] data, int64_t summarize = -1) output : diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml index 1ae80dbc8b8fc..beba440b5b6de 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml @@ -1,3 +1,8 @@ +# The operators included in this file are: +# 1) Operators defined only in PIR, dynamic graphs do not exist; +# 2) The definitions of static graphs and dynamic graphs are inconsistent, but the final definition plan has not yet been clarified. +# After the definition is clearly defined, migrate to paddle /fluid/pir/dialect/operator/ir/update_ops.yaml or paddle/phi/api/yaml/ops.yaml + - backward_op : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) diff --git a/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml b/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml index de542e68f30b9..23040d41ab5df 100644 --- a/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/update_ops.yaml @@ -1,4 +1,7 @@ -# Ops in this file is only used for pir currently and will replace ops of legacy_ops.yaml/ops.yaml of PHI in future. +# The operators contained in this file are: +# Operators that are inconsistent with the dynamic graph definition currently, +# but the final definition scheme of the static graph has been determined, after +# the dynamic graph is simultaneously upgraded, the operators in this file will be migrated to paddle/phi/api/yaml/ops.yaml. - op : arange args : (Scalar start, Scalar end, Scalar step, DataType dtype=DataType::FLOAT64, Place place=CPUPlace()) diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index 056dac8b6640a..f1741929b7beb 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -14,7 +14,6 @@ import argparse import hashlib -import os import pathlib import sys @@ -353,6 +352,7 @@ def gen( compats, ir_fwds, ir_revs, + ir_update_fwds, ) = ( load(prim_path), load(fwd_path), @@ -360,13 +360,11 @@ def gen( load(compat_path), load(fwd_pd_op_path), load(rev_pd_op_path), + load(update_fwd_pd_op_path), ) filter_compat_info(compats) - fwd_apis = fwds + ir_fwds - # replace old ir ops with pir ops - if os.path.exists(update_fwd_pd_op_path): - update_apis(fwd_apis, update_fwd_pd_op_path) + fwd_apis = fwds + ir_fwds + ir_update_fwds apis = [{**api, **{'is_fwd': True}} for api in fwd_apis] apis = apis + [{**api, **{'is_fwd': False}} for api in revs + ir_revs] From 27565243d15d8df64b81b79df6fb50d433b61190 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:56:22 +0800 Subject: [PATCH 13/14] chang_cc_test_old (#59384) * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * update * update * update * update --- paddle/common/ddim.h | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- paddle/phi/core/ddim.h | 2 +- test/cpp/pir/core/CMakeLists.txt | 10 +--------- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/paddle/common/ddim.h b/paddle/common/ddim.h index c2801fa54bd62..cfed0d221221d 100644 --- a/paddle/common/ddim.h +++ b/paddle/common/ddim.h @@ -64,7 +64,7 @@ inline void dynamic_dim_assign(const T1* in, T2* out, int n) { * * The number of dimensions must be between [1, 9]. */ -class DDim { +class TEST_API DDim { public: constexpr static int kMaxRank = 9; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 92dd5157ff6a5..f76f6af9dce96 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -317,8 +317,8 @@ cc_test_old( var_handle op_handle_base scope - phi memory + phi device_context gather_op_handle) diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index be11b4c9596cd..ff2abdb3b84b3 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -248,7 +248,7 @@ DDim slice_ddim(const DDim& dim, int begin, int end); int arity(const DDim& ddim); -TEST_API std::ostream& operator<<(std::ostream&, const DDim&); +std::ostream& operator<<(std::ostream&, const DDim&); /** * \brief Flatten dim to 3d diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt index 231f6a64cef6a..42c331c59fb70 100644 --- a/test/cpp/pir/core/CMakeLists.txt +++ b/test/cpp/pir/core/CMakeLists.txt @@ -1,11 +1,4 @@ -cc_test_old( - type_test - SRCS - type_test.cc - DEPS - pir - gtest - op_dialect_vjp) +paddle_test(type_test SRCS type_test.cc DEPS pir op_dialect_vjp) cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest) cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest) paddle_test( @@ -14,7 +7,6 @@ paddle_test( ir_op_test.cc DEPS pir - gtest test_dialect op_dialect_vjp) cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest) From c8b1e8b6ef066e82c496914bc9740047ce121d0e Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 29 Nov 2023 10:58:05 +0800 Subject: [PATCH 14/14] share inplace version counter for strided kernels (#59422) * share inplace version counter for strided kernels --- paddle/phi/kernels/stride/as_complex_kernel.cc | 1 + paddle/phi/kernels/stride/as_real_kernel.cc | 1 + paddle/phi/kernels/stride/as_strided_kernel.cc | 1 + paddle/phi/kernels/stride/complex_kernel.cc | 2 ++ paddle/phi/kernels/stride/diagonal_kernel.cc | 1 + paddle/phi/kernels/stride/index_select_kernel.cc | 1 + paddle/phi/kernels/stride/reshape_kernel.cc | 1 + paddle/phi/kernels/stride/slice_kernel.cc | 1 + paddle/phi/kernels/stride/squeeze_kernel.cc | 1 + paddle/phi/kernels/stride/strided_slice_kernel.cc | 1 + paddle/phi/kernels/stride/tensor_unfold_kernel.cc | 1 + paddle/phi/kernels/stride/transpose_kernel.cc | 1 + paddle/phi/kernels/stride/unsqueeze_kernel.cc | 1 + paddle/phi/kernels/stride/view_kernel.cc | 4 ++++ test/collective/fleet/hybrid_parallel_pp_recompute.py | 1 + 15 files changed, 19 insertions(+) diff --git a/paddle/phi/kernels/stride/as_complex_kernel.cc b/paddle/phi/kernels/stride/as_complex_kernel.cc index 6c5df876049d7..c2e7f816958eb 100644 --- a/paddle/phi/kernels/stride/as_complex_kernel.cc +++ b/paddle/phi/kernels/stride/as_complex_kernel.cc @@ -35,6 +35,7 @@ void AsComplexStridedKernel(const Context& dev_ctx, } out->set_offset(x.offset()); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/as_real_kernel.cc b/paddle/phi/kernels/stride/as_real_kernel.cc index 203fce123c66f..92357968809ce 100644 --- a/paddle/phi/kernels/stride/as_real_kernel.cc +++ b/paddle/phi/kernels/stride/as_real_kernel.cc @@ -33,6 +33,7 @@ void AsRealStridedKernel(const Context& dev_ctx, } out->set_offset(x.offset()); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/as_strided_kernel.cc b/paddle/phi/kernels/stride/as_strided_kernel.cc index f8aa3d81ffe67..0cd63577d1e96 100644 --- a/paddle/phi/kernels/stride/as_strided_kernel.cc +++ b/paddle/phi/kernels/stride/as_strided_kernel.cc @@ -28,6 +28,7 @@ void AsStridedKernel(const Context& dev_ctx, out->set_strides(DDim(stride.data(), static_cast(stride.size()))); out->set_offset(offset); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } } // namespace phi diff --git a/paddle/phi/kernels/stride/complex_kernel.cc b/paddle/phi/kernels/stride/complex_kernel.cc index ed9076141a54f..d72bfec2b09f0 100644 --- a/paddle/phi/kernels/stride/complex_kernel.cc +++ b/paddle/phi/kernels/stride/complex_kernel.cc @@ -35,6 +35,7 @@ void RealStridedKernel(const Context& dev_ctx, out->set_offset(x.offset()); out->set_strides(stride); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } template @@ -54,6 +55,7 @@ void ImagStridedKernel(const Context& dev_ctx, out->set_strides(stride); out->set_offset(x.offset() + phi::SizeOf(out->dtype())); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/diagonal_kernel.cc b/paddle/phi/kernels/stride/diagonal_kernel.cc index b4ca6d9b277df..31b2aa97e96fc 100644 --- a/paddle/phi/kernels/stride/diagonal_kernel.cc +++ b/paddle/phi/kernels/stride/diagonal_kernel.cc @@ -78,6 +78,7 @@ void DiagonalStridedKernel(const Context& dev_ctx, meta.offset = x_offset; out->set_meta(meta); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/index_select_kernel.cc b/paddle/phi/kernels/stride/index_select_kernel.cc index e5db5f9d18f76..b7f96be147532 100644 --- a/paddle/phi/kernels/stride/index_select_kernel.cc +++ b/paddle/phi/kernels/stride/index_select_kernel.cc @@ -53,6 +53,7 @@ void IndexSelectStridedKernel(const Context& ctx, meta.strides = DDim(stride.data(), static_cast(stride.size())); output->set_meta(meta); output->ResetHolder(x.Holder()); + output->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/reshape_kernel.cc b/paddle/phi/kernels/stride/reshape_kernel.cc index 732eb86a45e19..9d94e53314193 100644 --- a/paddle/phi/kernels/stride/reshape_kernel.cc +++ b/paddle/phi/kernels/stride/reshape_kernel.cc @@ -41,6 +41,7 @@ void ReshapeStridedKernel(const Context& dev_ctx, out->set_offset(x_offset); out->set_strides(stride); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } else { DenseTensor tmp; DenseTensor tmp_x = x; diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc index 195034fb92249..998bc2700df4f 100644 --- a/paddle/phi/kernels/stride/slice_kernel.cc +++ b/paddle/phi/kernels/stride/slice_kernel.cc @@ -99,6 +99,7 @@ void SliceStridedKernel(const Context& ctx, DDim(output_stride.data(), static_cast(output_stride.size())); out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } } // namespace phi diff --git a/paddle/phi/kernels/stride/squeeze_kernel.cc b/paddle/phi/kernels/stride/squeeze_kernel.cc index 90f240c7ce865..33895dfcf8e66 100644 --- a/paddle/phi/kernels/stride/squeeze_kernel.cc +++ b/paddle/phi/kernels/stride/squeeze_kernel.cc @@ -111,6 +111,7 @@ void SqueezeInferStridedKernel(const Context& dev_ctx, meta.offset = input.offset(); out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } template diff --git a/paddle/phi/kernels/stride/strided_slice_kernel.cc b/paddle/phi/kernels/stride/strided_slice_kernel.cc index 9bc36d5b15f97..a57ed98d119a9 100644 --- a/paddle/phi/kernels/stride/strided_slice_kernel.cc +++ b/paddle/phi/kernels/stride/strided_slice_kernel.cc @@ -121,6 +121,7 @@ void StridedSliceRawStridedKernel(const Context& dev_ctx, DDim(output_stride.data(), static_cast(output_stride.size())); out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } template diff --git a/paddle/phi/kernels/stride/tensor_unfold_kernel.cc b/paddle/phi/kernels/stride/tensor_unfold_kernel.cc index 9207dd6a7fde9..0a4d4ee9af3e2 100644 --- a/paddle/phi/kernels/stride/tensor_unfold_kernel.cc +++ b/paddle/phi/kernels/stride/tensor_unfold_kernel.cc @@ -65,6 +65,7 @@ void TensorUnfoldKernel(const Context& dev_ctx, out->set_strides(DDim(stride.data(), static_cast(stride.size()))); out->set_offset(input.offset()); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } } // namespace phi diff --git a/paddle/phi/kernels/stride/transpose_kernel.cc b/paddle/phi/kernels/stride/transpose_kernel.cc index 1fedb515ef020..acdc321ad0e8a 100644 --- a/paddle/phi/kernels/stride/transpose_kernel.cc +++ b/paddle/phi/kernels/stride/transpose_kernel.cc @@ -41,6 +41,7 @@ void TransposeStridedKernel(const Context& ctx, out->set_meta(meta); out->ResetHolder(x.Holder()); + out->ShareInplaceVersionCounterWith(x); } } // namespace phi diff --git a/paddle/phi/kernels/stride/unsqueeze_kernel.cc b/paddle/phi/kernels/stride/unsqueeze_kernel.cc index 45c50c37fe800..b97a0222e6cd6 100644 --- a/paddle/phi/kernels/stride/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/stride/unsqueeze_kernel.cc @@ -71,6 +71,7 @@ void UnsqueezeInferStridedKernel(const Context& dev_ctx, meta.offset = input.offset(); out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } template diff --git a/paddle/phi/kernels/stride/view_kernel.cc b/paddle/phi/kernels/stride/view_kernel.cc index ff92086ff2f88..d6ca9bf6c02f0 100644 --- a/paddle/phi/kernels/stride/view_kernel.cc +++ b/paddle/phi/kernels/stride/view_kernel.cc @@ -32,6 +32,7 @@ void ViewShapeKernel(const Context& dev_ctx, meta.offset = input.offset(); out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } else { PADDLE_THROW(phi::errors::InvalidArgument( "The Tensor can not be viewed, please call reshape.")); @@ -51,6 +52,7 @@ void ViewDtypeKernel(const Context& dev_ctx, meta.dtype = dtype; out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } else if (input_dtype_size == 0) { PADDLE_THROW(phi::errors::InvalidArgument( "The Tensor's shape is [] can not be viewed.")); @@ -82,6 +84,7 @@ void ViewDtypeKernel(const Context& dev_ctx, meta.offset = input.offset() * times; out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } else { PADDLE_ENFORCE_EQ( input.strides()[input.strides().size() - 1], @@ -140,6 +143,7 @@ void ViewDtypeKernel(const Context& dev_ctx, meta.offset = input.offset() / times; out->set_meta(meta); out->ResetHolder(input.Holder()); + out->ShareInplaceVersionCounterWith(input); } } diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py index fd03b562f25a3..3d95375d1da8a 100644 --- a/test/collective/fleet/hybrid_parallel_pp_recompute.py +++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py @@ -98,6 +98,7 @@ class TransformerNetPipe(TransformerNet): def forward(self, tensors): if framework.in_dynamic_mode(): stable, x = tensors + stable = paddle.assign(stable) output = super().forward(x) return stable, output else: