diff --git a/python/paddle/distributed/auto_parallel/callbacks.py b/python/paddle/distributed/auto_parallel/callbacks.py new file mode 100644 index 0000000000000..17ce5bd71b816 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/callbacks.py @@ -0,0 +1,226 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import paddle +from paddle.hapi.callbacks import ProgBarLogger, ModelCheckpoint, LRScheduler, CallbackList, Callback +from .interface import CollectionNames, get_collection + + +def config_callbacks(callbacks=None, + engine=None, + batch_size=None, + epochs=None, + steps=None, + log_freq=2, + verbose=2, + save_freq=1, + save_dir=None, + metrics=None, + acc_step=1, + mode='train'): + cbks = callbacks or [] + cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks] + + if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose: + cbks = [ProgBarLoggerAuto(log_freq, verbose=verbose)] + cbks + + if not any(isinstance(k, LRScheduler) for k in cbks): + cbks = [LRSchedulerAuto()] + cbks + + if not any(isinstance(k, ModelCheckpoint) for k in cbks): + cbks = cbks + [ModelCheckpointAuto(save_freq, save_dir)] + + if not any(isinstance(k, Profiler) for k in cbks) and verbose == 3: + cbks = cbks + [Profiler(timer_only=True)] + + if not any(isinstance(k, History) for k in cbks): + cbks = cbks + [History()] + + for i, k in enumerate(cbks): + if isinstance(k, ProgBarLogger): + cbks[i] = ProgBarLoggerAuto(k.log_freq, k.verbose) + if isinstance(k, LRScheduler): + cbks[i] = LRSchedulerAuto(k.by_step, k.by_epoch) + if isinstance(k, ModelCheckpoint): + cbks[i] = ModelCheckpointAuto(k.save_freq, k.save_dir) + + cbk_list = CallbackList(cbks) + cbk_list.set_model(engine) + metrics = metrics or [] if mode != 'test' else [] + params = { + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps, + 'verbose': verbose, + 'metrics': metrics, + 'acc_step': acc_step, + } + cbk_list.set_params(params) + return cbk_list + + +class ProgBarLoggerAuto(ProgBarLogger): + + def __init__(self, log_freq=1, verbose=2): + super(ProgBarLoggerAuto, self).__init__(log_freq, verbose) + + def _is_print(self): + return True + + def _updates(self, logs, mode): + values = [] + metrics = getattr(self, '%s_metrics' % (mode)) + progbar = getattr(self, '%s_progbar' % (mode)) + steps = getattr(self, '%s_step' % (mode)) + + for k in metrics: + if k in logs: + values.append((k, logs[k])) + + if 'lr' in logs: + values.append(('lr', logs['lr'])) + + fetches_logs = logs.get('fetches', {}) + collect_logging = get_collection(CollectionNames.LOGGING) + for name, var in collect_logging: + k = name or var.name + if k in fetches_logs: + values.append((k, fetches_logs[k])) + + out_logs = logs.get('outputs', {}) + for k in out_logs: + values.append((k, out_logs[k])) + + if self.verbose == 3 and hasattr(self, '_%s_timer' % (mode)): + timer = getattr(self, '_%s_timer' % (mode)) + cnt = timer['count'] if timer['count'] > 0 else 1.0 + samples = 
timer['samples'] if timer['samples'] > 0 else 1.0 + values.append( + ('avg_reader_cost', "%.5f sec" % (timer['data_time'] / cnt))) + values.append( + ('avg_batch_cost', "%.5f sec" % (timer['batch_time'] / cnt))) + values.append( + ('ips', "%.5f samples/sec" % + (samples / (timer['data_time'] + timer['batch_time'])))) + timer['count'] = 0 + timer['samples'] = 0 + timer['data_time'] = 0. + timer['batch_time'] = 0. + + progbar.update(steps, values) + + def on_eval_batch_end(self, step, logs=None): + logs = logs or {} + self.eval_step += 1 + samples = self.params['batch_size'] + self.evaled_samples += samples + + self._eval_timer['batch_time'] += ( + time.time() - self._eval_timer['batch_data_end_time']) + self._eval_timer['count'] += 1 + samples = self.params['batch_size'] + self._eval_timer['samples'] += samples + + if self._is_print() and self.eval_step % self.log_freq == 0: + if self.eval_steps is None or self.eval_step < self.eval_steps: + self._updates(logs, 'eval') + + self._eval_timer['batch_start_time'] = time.time() + + +class LRSchedulerAuto(LRScheduler): + + def __init__(self, by_step=True, by_epoch=False): + super(LRSchedulerAuto, self).__init__(by_step, by_epoch) + + def on_epoch_begin(self, epoch=None, logs=None): + self.acc_step = self.params["acc_step"] + self.epoch = epoch + self.train_step = 0 + + def on_train_batch_end(self, step, logs=None): + self.train_step += 1 + + if self.by_step and self.train_step % self.acc_step == 0: + if self.model._optimizer and \ + hasattr(self.model._optimizer, '_learning_rate') and \ + isinstance(self.model._optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + self.model._optimizer._learning_rate.step() + + +class History(Callback): + + def __init__(self): + self.history = {} + + def on_train_begin(self, logs=None): + self.epoch = [] + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + self.epoch.append(epoch) + for k, v in logs.items(): + self.history.setdefault(k, []).append(v) + + self.model.history = self + + +class Profiler(Callback): + + def __init__(self, *args, **kwargs): + self.prof = paddle.profiler.Profiler(*args, **kwargs) + + def on_epoch_begin(self, epoch=None, logs=None): + self.epoch = epoch + self.train_step = 0 + self.batch_size = self.params["batch_size"] + self.steps = self.params['steps'] + + def on_train_begin(self, logs=None): + self.prof.start() + + def on_train_batch_end(self, step, logs=None): + self.train_step += 1 + self.prof.step(num_samples=self.batch_size) + print("step {}:{}".format(self.train_step, + self.prof.step_info(unit='samples'))) + + def on_train_end(self, logs=None): + self.prof.stop() + self.prof.summary() + + +class ModelCheckpointAuto(ModelCheckpoint): + + def __init__(self, *args, **kwargs): + super(ModelCheckpointAuto, self).__init__(*args, **kwargs) + + def _is_save(self): + return self.model and self.save_dir + + def on_epoch_end(self, epoch, logs=None): + if self._is_save() and (self.epoch + 1) % self.save_freq == 0: + path = '{}/epoch{}'.format(self.save_dir, epoch) + print('save checkpoint at {}'.format(os.path.abspath(path))) + self.model.save(path) + + def on_train_end(self, logs=None): + if self._is_save(): + path = '{}/final'.format(self.save_dir) + print('save checkpoint at {}'.format(os.path.abspath(path))) + self.model.save(path) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 871228820414c..f420a06cfbc74 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ 
b/python/paddle/distributed/auto_parallel/completion.py @@ -19,7 +19,7 @@ from paddle.fluid import core from paddle.fluid import framework -from .utils import print_program_with_dist_attr, is_gradient_clip_op +from .utils import is_gradient_clip_op, __not_shape_var_type__ from .operators import find_compatible_distributed_operator_impls from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor @@ -142,6 +142,7 @@ class Completer: def __init__(self, dist_context): assert dist_context is not None self._dist_context = dist_context + self._has_prepared = False def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True): changed = False @@ -366,7 +367,14 @@ def _update_dims_mapping_between_graphs(self): def _update_dims_mapping_for_special(self): # Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it op_nodes = self._dist_context._serial_ordered_op_nodes + # NOTE: this list may be changed if Paddle changes the existing rules. + related_reader_ops = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] for op_node in op_nodes: + if op_node.op() is not None \ + and op_node.op().type() in related_reader_ops: + continue op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) for tensor_node in op_node.outputs: if tensor_node.is_var() and tensor_node.var() is not None: @@ -406,6 +414,7 @@ def _update_dims_mapping(self): reach_fix_point = False else: reach_fix_point = True + # NOTE: this will be removed after changing the reshard rule self._update_dims_mapping_for_special() def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): @@ -494,14 +503,14 @@ def _find_nodes_related_to_cond(source_node): for tensor_node in node.inputs: if tensor_node.is_var() and tensor_node.var( ) is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER \ + if tensor_node.var().type() in __not_shape_var_type__ \ or len(tensor_node.var().shape()) != 1: flag = False break for tensor_node in node.outputs: if tensor_node.is_var() and tensor_node.var( ) is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER \ + if tensor_node.var().type() in __not_shape_var_type__ \ or len(tensor_node.var().shape()) != 1: flag = False break @@ -719,6 +728,8 @@ def _update_process_mesh(self): self._update_process_mesh_between_graphs() def _prepare(self): + if self._has_prepared: + return self._while_op_nodes = {} self._array_nodes = {} self._node_pairs_between_graphs = [] @@ -732,6 +743,8 @@ def _prepare(self): if self._array_nodes.get(array_var_name, None) is None: self._array_nodes[array_var_name] = [] self._array_nodes[array_var_name].append(node) + # Add the array input node + self._array_nodes[array_var_name].append(node.inputs[0]) if node.op().type() == "write_to_array": array_var_name = node.op().output("Out")[0] if self._array_nodes.get(array_var_name, None) is None: @@ -752,6 +765,7 @@ def _prepare(self): and after_node.var().name() == node.var().name(): self._node_pairs_between_graphs.append( (after_node, node)) + self._has_prepared = True def complete_forward_annotation(self, serial_main_program=None): """ Complete annotation for the partial annotated serial_main_program. 
@@ -899,6 +913,72 @@ def _update_dist_attr_for_dp(self): else: dist_op.dist_attr = original_op_dist_attr + def _complete_tensor_dist_attr_by_op(self, serial_main_program=None): + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context._serial_main_program = serial_main_program + + self._dist_context.initialize() + + self._prepare() + + has_set_dist_attr = set() + + all_nodes = self._dist_context.serial_ordered_nodes + for node in all_nodes: + if node.is_op(): + if node.op().type() in ["while"]: + continue + dist_op = self._dist_context.get_dist_op_for_graph(node) + op_dist_attr = dist_op.dist_attr + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + tensor_desc = tensor_node.var() + tensor_name = tensor_desc.name() + tensor = dist_op.get_serial_input(tensor_name) + # Use the first op to set the tensor dist attr + if tensor_name in has_set_dist_attr: + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + tensor_dist_attr.process_mesh = op_dist_attr.process_mesh + tensor_dist_attr.dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_name) if tensor.is_parameter else [ + -1 for i in tensor_desc.shape() + ] + has_set_dist_attr.add(tensor_name) + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_name = tensor_node.var().name() + if tensor_name in has_set_dist_attr: + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + tensor_dist_attr.process_mesh = op_dist_attr.process_mesh + tensor_dist_attr.dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_name) + has_set_dist_attr.add(tensor_name) + + self._update_process_mesh_for_specials() + + self._update_process_mesh_between_graphs() + + self._update_dims_mapping_for_special() + + self._update_dims_mapping_between_graphs() + + # Copy the corresponding distributed attribute from graph to serial_main_program + self._dist_context.copy_dist_attr_from_graph_to_program() + + # Do the validation check and amend some completion + self._dist_context.amend_dist_attr_for_program() + + self._dist_context.validate_dist_attr_for_program() + def _complete_high_order_grad_annotation(self, serial_main_program=None): """ NOTE: diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 86a545322a294..82c5011faf0af 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -116,3 +116,10 @@ def set_field_default_config(category, field, default_value): set_field_default_config(TUNING, "profile_end_step", 1) set_field_default_config(TUNING, "run_after_tuning", True) set_field_default_config(TUNING, "verbose", True) + +######################################### +# dataset configuration +######################################### +DATASET = "dataset" +set_field_default_config(DATASET, "enable", False) +set_field_default_config(DATASET, "num_shards", 1) diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index b4ac972bcfd29..c5bdc85e1b5b1 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -167,6 +167,25 @@ def 
calc_time(self): return 0 +@register_op_cost +class DropoutGradOpCost(CompOpCost): + OP_TYPE = "dropout_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(DropoutGradOpCost, self).__init__(op=op, + op_desc=op_desc, + cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + @register_op_cost class ElementwiseAddOpCost(CompOpCost): OP_TYPE = "elementwise_add" @@ -395,6 +414,42 @@ def calc_time(self): return 0 +@register_op_cost +class FusedSoftmaxMaskUpperTriangleOpCost(CompOpCost): + OP_TYPE = "fused_softmax_mask_upper_triangle" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FusedSoftmaxMaskUpperTriangleOpCost, + self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FusedSoftmaxMaskUpperTriangleGradOpCost(CompOpCost): + OP_TYPE = "fused_softmax_mask_upper_triangle_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FusedSoftmaxMaskUpperTriangleGradOpCost, + self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + @register_op_cost class GatherOpCost(CompOpCost): OP_TYPE = "gather" diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py index 7bdde90b6a711..94dc5287910f2 100644 --- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -45,6 +45,8 @@ def __init__(self, ) # {`op_id`: {"reshard": [], "dist_op": [], "local_cost": local_cost}}} self._bubble_time_mapping = {} self._ordered_ops = [] + self.max_memories = {} + self.max_memory = None @property def loop_count(self): @@ -123,7 +125,7 @@ def _estimate_core(self, dist_context, resharder, block): for i in range(loop_count): for op in ops: self._detailed_cost[op.desc.id()] = OrderedDict() - # if in the while sub block, the detail of cost is the last cost + # If in the while sub block, the detail of cost is the last cost detail = self._detailed_cost[op.desc.id()] detail["reshard_cost"] = OrderedDict() # detail["dist_op_cost"] = [] @@ -147,15 +149,15 @@ def _estimate_core(self, dist_context, resharder, block): var = get_var_with_recursion(var_name, block, self.program) reshard_cost = resharder.get_cost(op, var, self.cluster) - # calc reshard cost + # Calc reshard cost if reshard_cost is not None: detail["reshard_cost"][var_name] = reshard_cost comm_costs = reshard_cost[0] local_comp_cost = reshard_cost[1] for comm_cost in comm_costs: - # time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost. - # comm sync + # Time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost. 
+ # Comm sync for item in comm_cost: group_ranks, cost = item max_time = None @@ -183,7 +185,7 @@ def _estimate_core(self, dist_context, resharder, block): for comp_cost in local_comp_cost[rank]: self.local_cost(rank).time += comp_cost.time - # calc dist op cost + # Calc dist op cost dist_op = dist_context.get_dist_op_for_program(op) op_dist_attr = dist_op.dist_attr processes = op_dist_attr.process_mesh.processes @@ -201,7 +203,7 @@ def _estimate_core(self, dist_context, resharder, block): continue for item in dist_op_cost: if isinstance(item, list): - # comm sync + # Comm sync for comm_op_cost in item: max_time = None cost_time = {} @@ -222,9 +224,9 @@ def _estimate_core(self, dist_context, resharder, block): self._bubble_time_mapping[rank] += ( max_time - cost_time[rank]) elif isinstance(item, dict): - # op just one + # Op just one for rank in processes: - # dp+pp+mp + # DP+PP+MP if rank not in item: continue self.local_cost(rank).time += item[rank].time @@ -267,7 +269,7 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): return result memories = {} - max_memories = {} + self.max_memories = {} var_info = { } # var_name: [[process_mesh, dims_mapping], [id]], [[process_mesh, dims_mapping], [id]]} @@ -277,6 +279,10 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): self._ordered_ops.sort(key=lambda x: x[0]) for op_id, op in self._ordered_ops: + if op.type in [ + "create_py_reader", "create_double_buffer_reader", "read" + ]: + continue dist_op = dist_context.get_dist_op_for_program(op) process_mesh = dist_op.dist_attr.process_mesh for var_name in op.input_arg_names: @@ -288,7 +294,7 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): input_dims_mapping) if key not in var_info[var_name]: var_info[var_name][key] = {} - # it is even partition now + # It is even partition now if "memory" not in var_info[var_name][key]: var = dist_op.get_serial_input(var_name) global_sizes = var.shape @@ -326,6 +332,10 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): has_used_vars = set() for op_id, op in self._ordered_ops: + if op.type in [ + "create_py_reader", "create_double_buffer_reader", "read" + ]: + continue can_free_memories = {} can_free_vars = set() dist_op = dist_context.get_dist_op_for_program(op) @@ -337,14 +347,14 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): input_dims_mapping) has_used_var = var_name + key var = dist_op.get_serial_input(var_name) - # not used + # Not used if var_name + key not in has_used_vars: has_used_vars.add(has_used_var) for process in process_mesh.processes: if process not in memories: memories[process] = 0 memories[process] += var_info[var_name][key]["memory"] - # used + # Used else: if op_id == var_info[var_name][key]["position"][-1]: if has_used_var not in can_free_vars: @@ -363,14 +373,14 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): output_dims_mapping) has_used_var = var_name + key var = dist_op.get_serial_output(var_name) - # not used + # Not used if var_name + key not in has_used_vars: has_used_vars.add(has_used_var) for process in process_mesh.processes: if process not in memories: memories[process] = 0 memories[process] += var_info[var_name][key]["memory"] - # used + # Used else: if op_id == var_info[var_name][key]["position"][-1]: if has_used_var not in can_free_vars: @@ -382,21 +392,22 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): can_free_memories[process] += var_info[ var_name][key]["memory"] - # calc peak memory + # Calc peak memory for process in memories: - if 
process not in max_memories: - max_memories[process] = memories[process] + if process not in self.max_memories: + self.max_memories[process] = memories[process] else: - if memories[process] > max_memories[process]: - max_memories[process] = memories[process] + if memories[process] > self.max_memories[process]: + self.max_memories[process] = memories[process] - # free memory + # Free memory for process in can_free_memories: if process in memories: memories[process] -= can_free_memories[process] # Calculate the max memory in all ranks - max_memory = max(max_memories.values()) + max_memory = max(self.max_memories.values()) + self.max_memory = max_memory return max_memory @@ -410,3 +421,143 @@ def estimate(self, dist_context, resharder=None): self._estimate_core(dist_context, resharder, block) return self.global_cost + + def _print_tag(self, max_len, length): + tag = "+" + "-" * max_len + for i in range(length): + print(tag, end="") + if i == length - 1: + print("+") + + def _print_vals(self, vals, max_len): + for idx, val in enumerate(vals): + s = "|" + str(val).center(max_len) + print(s, end="") + if idx == len(vals) - 1: + print("|") + + def _pretty_print_memory_cost(self): + """Print memory of every rank prettily.""" + if not self.max_memories or not self.max_memory: + raise ValueError("Please calculate memory cost before print.") + + # Padding automatically + max_len = 0 + header = ["Rank", "Memory(MiB)"] + memories = [ + int(item // 1e6) for item in list(self.max_memories.values()) + ] + for memory in (memories + header): + if len(str(memory)) > max_len: + max_len = len(str(memory)) + max_len += 4 # for pretty print of center + + # Print tag + self._print_tag(max_len, len(header)) + + # Print header + self._print_vals(header, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + # Print rank and its memory + for i in range(len(self.max_memories)): + memory = memories[i] + vals = [i, memory] + self._print_vals(vals, max_len) + self._print_tag(max_len, len(header)) + + def _pretty_print_global(self): + """Print global execution time and max memory prettily.""" + if not self.max_memories or not self.max_memory: + raise ValueError("Please calculate cost before print.") + + # Padding automatically + max_len = 0 + header = ["Execution Time(ms)", "Max Memory(MiB)"] + vals = [round(self.global_cost.time, 3), int(self.max_memory // 1e6)] + for memory in (vals + header): + if len(str(memory)) > max_len: + max_len = len(str(memory)) + max_len += 4 # for pretty print of center + + # Print tag + self._print_tag(max_len, len(header)) + + # Print header + self._print_vals(header, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + # Print exec time and max memory + self._print_vals(vals, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + def pretty_print_cost(self): + """Print cost prettily.""" + print("The global execution time and max memory are as follows:") + self._pretty_print_global() + print("The memory of every rank is as follows:") + self._pretty_print_memory_cost() + + +def get_cost_from_engine(engine, mode): + from ..utils import to_list + # Construct cost estimator by original main program + serial_main_prog = engine._serial_main_progs[mode].clone( + ) if mode in engine._serial_main_progs else engine._orig_main_prog.clone() + + serial_startup_prog = engine._serial_startup_progs[mode].clone( + ) if mode in engine._serial_startup_progs else engine._orig_startup_prog.clone( + ) + losses = to_list( + engine._loss) if (not 
isinstance(engine._loss, paddle.nn.Layer) + and not callable(engine._loss)) else engine._losses + + if mode in engine._dist_contexts: + dist_context = engine._dist_contexts[mode] + completer = engine._planners[mode].completer + else: + from ..completion import Completer + from ..dist_context import DistributedContext + dist_context = DistributedContext(serial_main_prog, serial_startup_prog, + engine._optimizer, losses, {}, + {"loss": losses}, engine._cluster, + engine._strategy) + completer = Completer(dist_context) + completer.complete_forward_annotation() + dist_context.block_state.parse_forward_blocks( + dist_context.serial_main_program) + + if mode == "eval" or mode == "predict": + cost_estimator = CostEstimator(serial_main_prog, engine._cluster) + elif mode == "train": + from ..parallelizer_v2 import Parallelizer + # Get serial main program with backward + serial_optimizer = engine._optimizer + parallelizer = Parallelizer(mode, completer, dist_context) + # Generate backward + loss_name = dist_context.serial_loss.name + serial_loss = serial_main_prog.global_block()._var_recursive(loss_name) + params_grads = parallelizer._generate_backward(serial_main_prog, + serial_startup_prog, + serial_loss) + + # Generate optimizer + optimizer_ops = parallelizer._generate_optimizer( + serial_main_prog, serial_startup_prog, serial_optimizer, + params_grads) + cost_estimator = CostEstimator(serial_main_prog, engine._cluster) + + # Estimate global_cost and max memory + global_cost = cost_estimator.estimate(dist_context) + max_memory = cost_estimator._estimate_max_memory_by_dist_op(dist_context) + + # Print the cost + cost_estimator.pretty_print_cost() + + return global_cost, max_memory diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index d1f00e8a7ba4f..387c964f0aa35 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -77,7 +77,6 @@ def __init__(self, self._serial_optimizer = None self._serial_feed_vars = {} self._serial_fetch_vars = {} - self._lr_optimizer = None # record the optimzier holding lr_scheduler # Data members related to the program self._dist_tensors_for_program = {} @@ -268,12 +267,24 @@ def _restore_serial_feed_vars(self): def _restore_serial_fetch_vars(self): for key, var_list in self._original_serial_fetch_vars.items(): new_var_list = [] - for var in var_list: - block_idx = var.block.idx - var_name = var.name - var = self._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) - new_var_list.append(var) + # metrics is a list of list + if key == "metrics": + for inner_var_list in var_list: + new_inner_var_list = [] + for var in inner_var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_inner_var_list.append(var) + new_var_list.append(new_inner_var_list) + else: + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) self._serial_fetch_vars[key] = new_var_list def _restore_serial_info(self, mode="to_backup"): @@ -861,7 +872,7 @@ def __deepcopy__(self, memo): "_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \ "_serial_ordered_op_nodes", "_original_serial_loss", \ "_original_serial_feed_vars", "_original_serial_fetch_vars", \ - "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", 
"_lr_optimizer", \ + "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_serial_optimizer", \ "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \ "_pass_context"]: setattr(result, k, v) diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 44f720ade7f80..38b537799e546 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -14,44 +14,14 @@ import abc import numpy as np -from functools import wraps import paddle -from .utils import to_list -from paddle.fluid.layers.utils import flatten -from paddle.io import DataLoader, BatchSampler, IterableDataset -from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler +from paddle.io import BatchSampler, IterableDataset +from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler, DistributedBatchSampler from paddle.fluid.dataloader.dataloader_iter import _DatasetKind, default_collate_fn, default_convert_fn -class DistributedDataLoader(metaclass=abc.ABCMeta): - - def __init__(self, dataset, batch_size=1, epochs=1, drop_last=False): - if isinstance(dataset, IterableDataset): - self.dataset_kind = _DatasetKind.ITER - else: - self.dataset_kind = _DatasetKind.MAP - - self.dataset = dataset - self.epochs = epochs - self.drop_lost = drop_last - - if batch_size is None: - self.batch_size = None - self.batch_sampler = None - else: - self.batch_size = batch_size - if isinstance(dataset, IterableDataset): - self.batch_sampler = _InfiniteIterableSampler( - dataset, batch_size) - else: - self.batch_sampler = BatchSampler(dataset, - batch_size=batch_size, - shuffle=False, - drop_last=drop_last) - - self.auto_collate_batch = self.batch_sampler is not None - self.sampler_iter = iter(self.index_sampler) +class DistributedDataLoaderBase(metaclass=abc.ABCMeta): @abc.abstractmethod def __iter__(self): @@ -72,40 +42,72 @@ def index_sampler(self): return _InfiniteIterableSampler(self.dataset, 1) -class NonIterableGeneratorLoader(DistributedDataLoader): +class DistributedDataLoaderFromGenerator(DistributedDataLoaderBase): def __init__(self, dataset, - feed_list, - places, + feed_list=None, + capacity=None, + use_double_buffer=True, + iterable=True, + return_list=False, + use_multiprocess=False, + drop_last=True, + places=None, batch_size=1, epochs=1, steps_per_epoch=None, collate_fn=None, + split_data=True, data_parallel_world_size=[], - data_parallel_rank=[], - drop_last=False, - split_data=True): + data_parallel_rank=[]): + self.dataset = dataset self.feed_list = feed_list + self.capacity = capacity + self.use_double_buffer = use_double_buffer + self.iterable = iterable + self.return_list = return_list + self.use_multiprocess = use_multiprocess + self.drop_last = drop_last self.places = places + self.batch_size = batch_size + self.epochs = epochs self.steps_per_epoch = steps_per_epoch - + self.collate_fn = collate_fn + self.split_data = split_data assert len(data_parallel_world_size) == len(feed_list) assert len(data_parallel_rank) == len(feed_list) self.dp_world_sizes = data_parallel_world_size self.dp_ranks = data_parallel_rank - self.split_data = split_data - super(NonIterableGeneratorLoader, - self).__init__(dataset, batch_size, epochs, drop_last) + if isinstance(dataset, IterableDataset): + self.dataset_kind = _DatasetKind.ITER + else: + self.dataset_kind = _DatasetKind.MAP + + if self.batch_size is None: + self.batch_sampler = None + else: + if 
isinstance(dataset, IterableDataset): + self.batch_sampler = _InfiniteIterableSampler( + dataset, batch_size) + else: + self.batch_sampler = BatchSampler(dataset, + batch_size=batch_size, + shuffle=False, + drop_last=drop_last) + + self.auto_collate_batch = self.batch_sampler is not None + self.sampler_iter = iter(self.index_sampler) if self.auto_collate_batch: self.collate_fn = collate_fn or default_collate_fn else: self.collate_fn = collate_fn or default_convert_fn + self.dataset_fetcher = _DatasetKind.create_fetcher( self.dataset_kind, self.dataset, self.auto_collate_batch, - self.collate_fn, self.drop_lost) + self.collate_fn, self.drop_last) self._steps = self._infer_steps() self._inner_dataloader = self._create_inner_dataloader() @@ -118,8 +120,10 @@ def __iter__(self): def __next__(self): if not self._steps: self._cur_step += 1 + return None elif self._cur_step < self._steps: self._cur_step += 1 + return None else: self._inner_dataloader.reset() self.sampler_iter = iter(self.index_sampler) @@ -141,6 +145,16 @@ def _infer_steps(self): ) return steps_per_epoch + @property + def index_sampler(self): + if self.auto_collate_batch: + return self.batch_sampler + else: + if self.dataset_kind == _DatasetKind.MAP: + return list(range(len(self.dataset))) + else: + return _InfiniteIterableSampler(self.dataset, 1) + def _create_inner_dataloader(self): def data_generator(): @@ -153,7 +167,7 @@ def data_generator(): self.dataset_fetcher = _DatasetKind.create_fetcher( self.dataset_kind, self.dataset, self.auto_collate_batch, self.collate_fn, - self.drop_lost) + self.drop_last) break partial_data = [] @@ -173,7 +187,83 @@ def data_generator(): yield partial_data dataloader = paddle.fluid.io.DataLoader.from_generator( - feed_list=self.feed_list, capacity=70, iterable=False) + feed_list=self.feed_list, + capacity=self.capacity, + use_double_buffer=self.use_double_buffer, + # iterable=self.iterable, + iterable=False, + return_list=self.return_list, + use_multiprocess=self.use_multiprocess, + drop_last=self.drop_last) dataloader.set_batch_generator(data_generator, self.places) return dataloader + + +class DistributedDataLoader(DistributedDataLoaderBase): + + def __init__(self, + dataset, + feed_list=None, + places=None, + return_list=True, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None, + split_data=True, + data_parallel_world_size=[], + data_parallel_rank=[]): + self.dataset = dataset + self.feed_list = feed_list + self.return_list = return_list + self.places = places + self.batch_size = batch_size + self.shuffle = shuffle + self.drop_last = drop_last + self.collate_fn = collate_fn + self.num_workers = num_workers + self.use_buffer_reader = use_buffer_reader + self.use_shared_memory = use_shared_memory + self.timeout = timeout + self.worker_init_fn = worker_init_fn + self.epochs = epochs + self.steps_per_epoch = steps_per_epoch + self.dp_world_sizes = data_parallel_world_size + self.dp_ranks = data_parallel_rank + self.split_data = split_data + # TODO: rank info + self.batch_sampler = DistributedBatchSampler( + self.dataset, self.batch_size, self.dp_world_sizes[0], + self.dp_ranks[0], self.shuffle, self.drop_last) + self._inner_dataloader = self._create_inner_dataloader() + + def __iter__(self): + return self + + def __next__(self): + return next(self.data) + + def _create_inner_dataloader(self): + dataloader = paddle.fluid.io.DataLoader( + 
self.dataset, + feed_list=self.feed_list, + places=self.places, + return_list=self.return_list, + batch_sampler=self.batch_sampler, + collate_fn=self.collate_fn, + num_workers=self.num_workers, + use_buffer_reader=self.use_buffer_reader, + use_shared_memory=self.use_shared_memory, + timeout=self.timeout, + worker_init_fn=self.worker_init_fn) + self.data = (x for x in dataloader) + + return dataloader diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index e329f775590f5..a2e1477f8873c 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -13,8 +13,6 @@ # limitations under the License. import os -import time -import copy import logging import random import numpy as np @@ -24,18 +22,18 @@ import paddle.utils as utils from paddle import fluid, static -from paddle.jit import to_static from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core from paddle.fluid import Variable from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope, _to_name_str -from paddle.fluid.framework import Operator, Parameter, _non_static_mode +from paddle.fluid.framework import Operator, _non_static_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet +from .callbacks import config_callbacks from .converter import Converter from .helper import ProgramHelper from .cluster import Cluster, get_default_cluster @@ -43,13 +41,15 @@ from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver -from .dist_loader import NonIterableGeneratorLoader -from .utils import print_program_with_dist_attr, to_list -from .utils import get_logger, get_dist_attr +from .dist_loader import DistributedDataLoaderFromGenerator, DistributedDataLoader +from .utils import to_list, get_dist_attr, get_lr from .process_group import new_process_group, get_all_process_groups from .dist_context import DistributedContext, get_default_distributed_context from .strategy import Strategy -from .interface import _get_fetches +from .interface import CollectionNames, get_collection +from ..utils.log_utils import get_logger +from .utils import initialize_pg_in_full_mode +from .cost.estimate_cost import get_cost_from_engine class Engine: @@ -129,12 +129,6 @@ def __init__(self, "'model must be sub classes of `paddle.nn.Layer` or any callable function." ) self._model = model - - if loss and not isinstance(loss, - paddle.nn.Layer) and not callable(loss): - raise TypeError( - "'loss' must be sub classes of `paddle.nn.Layer` or any callable function." 
- ) self._loss = loss if optimizer and not isinstance( @@ -187,17 +181,277 @@ def __init__(self, self._feed_vars = {} self._fetch_vars = {} self._planners = {} - self._mode_init_states = { + self._has_prepared = {"train": False, "eval": False, "predict": False} + self._has_prepared_reader = { "train": False, "eval": False, "predict": False } + self._inputs_spec = [] + self._labels_spec = [] + self._inputs = [] + self._labels = [] + self._skip_build = False + self._outside_dataloader = False self._planned_mode = None self._dygraph_mode = False self._tuning = self._strategy.tuning + self._losses = None + + self.history = None + + def _prepare_data_spec(self, data, split, batch_size): + inputs_spec = [] + labels_spec = [] + if isinstance(data, paddle.io.IterableDataset): + if split is None: + inputs, labels = next(iter(data)) + else: + sample = next(iter(data)) + inputs = sample[:split] + labels = sample[split:] + elif isinstance(data, paddle.io.Dataset): + if split is None: + inputs, labels = data[0] + else: + sample = data[0] + inputs = sample[:split] + labels = sample[split:] + else: + raise ValueError( + "Data should be a Dataset or IterableDatset, but received {}.". + format(type(data).__name__)) + inputs = to_list(inputs) + labels = to_list(labels) + + num_shards = self._strategy.dataset.num_shards + + def _adjust_item_spec(num_shards, spec): + if num_shards > 1 and len(spec.shape) > 1: + spec.shape[0] = spec.shape[0] * num_shards + + def _infer_item_spec(item, name, batch_size, specs): + if isinstance(item, np.ndarray): + spec = InputSpec.from_numpy(item, name) + if batch_size is None: + _adjust_item_spec(num_shards, spec) + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)): + _adjust_item_spec(num_shards, spec) + spec = InputSpec.from_tensor(item, name) + if batch_size is None: + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + else: + specs.append(InputSpec([batch_size], type(item), name)) + + if inputs is not None: + for i, item in enumerate(inputs): + assert item is not None, "Receive None input." + name = "input" + str(i) + _infer_item_spec(item, name, batch_size, inputs_spec) + if labels is not None: + for i, item in enumerate(labels): + assert item is not None, "Receive None input." 
+ name = "label" + str(i) + _infer_item_spec(item, name, batch_size, labels_spec) + + inputs_spec = self._validate_spec(inputs_spec) + labels_spec = self._validate_spec(labels_spec) + return inputs_spec, labels_spec + + def _prepare_data_tensor(self, + inputs_spec, + labels_spec, + inputs=None, + labels=None): + if _non_static_mode() or self._dygraph_mode: + return None, None + inputs_spec = inputs_spec if inputs_spec else [] + labels_spec = labels_spec if labels_spec else [] + if inputs_spec: + assert isinstance(inputs_spec, list), \ + "inputs should be list, but received {}".format(type(inputs_spec)) + if inputs is None: + inputs = [s._create_feed_layer() for s in inputs_spec] + else: + assert isinstance(inputs, list), \ + "inputs should be list, but received {}".format(type(inputs)) + for input_spec, input in zip(inputs_spec, inputs): + if input_spec.shape != input.shape: + input.desc.set_shape(input_spec.shape) + if labels_spec: + assert isinstance(labels_spec, list), \ + "labels should be list, but received {}".format(type(labels_spec)) + if labels is None: + labels = [s._create_feed_layer() for s in labels_spec] + else: + assert isinstance(labels, list), \ + "labels should be list, but received {}".format(type(labels)) + for label_spec, label in zip(labels_spec, labels): + if label_spec.shape != label.shape: + label.desc.set_shape(label_spec.shape) + return inputs, labels + + def _prepare_reader(self): + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_block = dist_main_prog.global_block() - def _prepare_single_mode(self, mode): + # NOTE: this list may be changed if Paddle changes the existing rules. + related_reader_ops = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] + # remove the first three ops if multiple run fit/evaluate/predict + if dist_main_block.ops[0].type == 'create_py_reader': + for i in range(len(related_reader_ops)): + if dist_main_block.ops[0].type in related_reader_ops: + dist_main_block._remove_op(0, sync=False) + dist_main_block._sync_with_cpp() + # Step 1: find the reader ops + reader_op_indices = [] + for idx, op in enumerate(dist_main_block.ops): + if op.type in related_reader_ops: + reader_op_indices.append(idx) + # Step 2: insert the new reader ops to cpp + new_reader_ops = [] + for idx in reversed(reader_op_indices): + new_op_desc = dist_main_block.desc._prepend_op() + new_op_desc.copy_from(dist_main_block.ops[idx].desc) + new_op = Operator(dist_main_block, + new_op_desc, + type=new_op_desc.type()) + new_reader_ops.append(new_op) + dist_op = DistributedOperator(new_op) + dist_context.add_dist_op_for_program(dist_op) + # Step 3: insert the new reader ops to python + for new_op in new_reader_ops: + dist_main_block.ops.insert(0, new_op) + for i in range(len(reader_op_indices)): + reader_op_indices[i] += len(reader_op_indices) + # Step 4: remove the old reader ops from python and cpp + for idx in reversed(reader_op_indices): + op = dist_main_block.ops.pop(idx) + dist_main_block.desc._remove_op(idx, idx + 1) + dist_main_block._sync_with_cpp() + self._has_prepared_reader[self._mode] = True + + def _prepare_feed(self, data, user_feeds, mode): + feeds = {} + if data is not None: + if isinstance(data, (list, tuple)): + if len(data) == 1 and isinstance(data[0], dict): + for name, data in data[0].items(): + feeds[name] = data + else: + raise ValueError("Unsupported data {}".format(data)) + elif isinstance(data, dict): + for name, data in data.items(): + feeds[name] = data 
+ else: + raise ValueError("Unsupported data {}".format(data)) + if user_feeds is not None: + assert isinstance(user_feeds, dict), \ + "user_feeds must be a dict, but receive {}".format(type(user_feeds).__name__) + for name, data in user_feeds.items(): + feeds[name] = data + return feeds + + def _prepare_fetch(self, user_fetches, mode): + if user_fetches is not None: + assert isinstance(user_fetches, list), \ + "user_fetches must be a list, but receive {}".format(type(user_fetches).__name__) + fetch_names = [] + fetch_indices = [] + + def _process_fetch_group(group_name, var_list): + group_indices = [] + for var in var_list: + # Remove duplicate var_names + if self._is_local_var(var): + var_name = _to_name_str(var) + if var_name not in fetch_names: + fetch_names.append(var_name) + group_indices.append(fetch_names.index(var_name)) + if not group_indices: + fetch_names.append([]) + fetch_indices.append(group_indices) + + if mode != "predict": + _process_fetch_group("loss", self._fetch_vars[mode]["loss"]) + if mode != "predict": + metrics = self._fetch_vars[mode]["metrics"] + for i, var_list in enumerate(metrics): + _process_fetch_group("metrics_" + str(i), var_list) + if mode == "predict": + _process_fetch_group("outputs", self._fetch_vars[mode]["outputs"]) + user_fetches_collection = [ + item[1] for item in get_collection(CollectionNames.FETCHES) + ] + var_list = (user_fetches_collection or []) + (user_fetches or []) + _process_fetch_group("fetches", var_list) + return fetch_names, fetch_indices + + def _prepare_logger(self, + outs, + epoch=None, + step=None, + lr=None, + fetch_names=None, + fetch_indices=None, + mode=None): + logs = {} + if epoch is not None: + logs["epoch"] = epoch + if step is not None: + logs["step"] = step + 1 + if lr is not None: + logs["lr"] = lr + group_idx = 0 + if mode != "predict": + # logging loss + loss_indices = fetch_indices[group_idx] + assert len(loss_indices) <= 1 + for idx in loss_indices: + logs["loss"] = outs[idx][0] + group_idx += 1 + # logging metrics + metric_vars = self._fetch_vars[mode]["metrics"] + if metric_vars: + for metric in self._metrics: + metrics_indices = fetch_indices[group_idx] + metric_out = [] + for idx in metrics_indices: + metric_out.append(outs[idx]) + if metric_out: + metric.update(*metric_out) + results = metric.accumulate() + for i, res in enumerate(to_list(results)): + logs[metric.name()[i]] = res + group_idx += 1 + # logging outputs + elif mode == "predict": + outputs_indices = fetch_indices[group_idx] + logs_out = {} + for idx in outputs_indices: + logs_out["out%d" % (idx)] = outs[idx] + logs["outputs"] = logs_out + group_idx += 1 + # logging user fetches + collect_fetches = get_collection(CollectionNames.FETCHES) + logs_fetch = {} + for name, var in collect_fetches: + if var.name in fetch_names: + idx = fetch_names.index(var.name) + logs_fetch[name or var.name] = outs[idx] + logs["fetches"] = logs_fetch + return logs + + def _prepare_program(self, mode): # Do the build process self._build(mode) # Do the planning process @@ -206,7 +460,7 @@ def _prepare_single_mode(self, mode): self._parallel(mode) # Init comm and startup program self._initialize(mode) - self._mode_init_states[mode] = True + self._has_prepared[mode] = True def _build(self, mode): if _non_static_mode() or self._dygraph_mode: @@ -214,8 +468,8 @@ def _build(self, mode): self._dygraph_mode = True self._logger.info("Building model with 'to_static' method.") - inputs_spec = self.inputs_spec - labels_spec = self.labels_spec if self.labels_spec else [] + inputs_spec 
= self._inputs_spec + labels_spec = self._labels_spec if self._labels_spec else [] self.program_helper = ProgramHelper(self._model, self._loss, self._metrics, inputs_spec, labels_spec) @@ -230,8 +484,12 @@ def _build(self, mode): outputs = self.program_helper.output_vars labels = self.program_helper.label_vars losses = self.program_helper.loss_vars + self._losses = losses metrics = self.program_helper.metric_vars + self._inputs = inputs + self._labels = labels + paddle.enable_static() else: # build program in static mode @@ -239,24 +497,28 @@ def _build(self, mode): if serial_main_prog is not None: return + outputs = [] losses = [] metrics = [] + inputs = self._inputs if self._inputs else [] + labels = self._labels if self._labels else [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() - with static.program_guard(serial_main_prog, serial_startup_prog), \ - utils.unique_name.guard(): - inputs_spec = self.inputs_spec - labels_spec = self.labels_spec if self.labels_spec else [] - inputs = [s._create_feed_layer() for s in inputs_spec] - labels = [s._create_feed_layer() for s in labels_spec] - outputs = to_list(self._model(*inputs)) - if mode != "predict" and self._loss: - losses = to_list(self._loss(*(outputs + labels))) - - if mode != "predict": - for metric in self._metrics: - metrics.extend( - to_list(metric.compute(*(outputs + labels)))) + if not self._skip_build: + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): + outputs = to_list(self._model(*inputs)) + if mode != "predict" and self._loss: + losses = to_list(self._loss(*(outputs + labels))) + self._losses = losses + + if mode != "predict" and (outputs or labels): + for metric in self._metrics: + metrics.append( + to_list(metric.compute(*(outputs + labels)))) + else: + losses = to_list(self._loss) + self.losses = losses default_ctx = get_default_distributed_context() if not default_ctx.has_annotation: @@ -299,8 +561,8 @@ def _optimization_tuning(self, mode, dataset, batch_size): self._optimization_tuner = OptimizationTuner(self._tuning.to_dict(), self._dist_contexts[mode], dataset, - self.inputs_spec, - self.labels_spec, + self._inputs_spec, + self._labels_spec, batch_size=batch_size, rank=self._cur_rank) @@ -324,6 +586,7 @@ def _plan(self, mode): inputs_var = self._dist_contexts[mode].serial_feed_vars["inputs"] labels_var = self._dist_contexts[mode].serial_feed_vars["labels"] block = self._dist_contexts[mode].serial_main_program.global_block() + # TODO: check this feed_list feed_list = [] for var in inputs_var + labels_var: if var.name in block.vars: @@ -378,18 +641,20 @@ def _initialize(self, mode): mode].dist_startup_programs self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars - self._lr_optimizer = self._dist_contexts[mode]._lr_optimizer + self._optimizer = self._dist_contexts[mode]._serial_optimizer if self._nranks > 1: # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. 
all_process_groups = get_all_process_groups() - # NOTE: add the comm init control in the future for auto search - for process_group in all_process_groups: - if self._cur_rank not in process_group.ranks: - continue - process_group.instantiate() + if self._strategy.auto_mode == "full": + initialize_pg_in_full_mode(all_process_groups, cur_rank) + else: + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + process_group.instantiate() place = _get_device() if isinstance(place, fluid.CUDAPlace): @@ -423,77 +688,26 @@ def _initialize(self, mode): self._dist_attr) if self._strategy.reinit: - self._logger.info("NOTE: parameters wiil be re-initialized.") + self._logger.info("NOTE: parameters will be re-initialized.") dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] self._executor.run(dist_startup_prog) - def _infer_sample_spec(self, data, batch_size, split): - if isinstance(data, paddle.io.IterableDataset): - if split is None: - input, label = next(iter(data)) - else: - sample = next(iter(data)) - input = sample[:split] - label = sample[split:] - elif isinstance(data, paddle.io.Dataset): - if split is None: - input, label = data[0] - else: - sample = data[0] - input = sample[:split] - label = sample[split:] - else: - raise ValueError( - "Data should be a Dataset or IterableDatset, but received {}.". - format(type(data).__name__)) - - self.inputs_spec = [] - self.labels_spec = [] - input_list = to_list(input) - label_list = to_list(label) - - def _infer_item_spec(item, name, batch_size, specs): - if isinstance(item, np.ndarray): - spec = InputSpec.from_numpy(item, name) - if batch_size is None: - specs.append(spec) - else: - specs.append(spec.batch(batch_size)) - elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)): - spec = InputSpec.from_tensor(item, name) - if batch_size is None: - specs.append(spec) - else: - specs.append(spec.batch(batch_size)) - else: - specs.append(InputSpec([batch_size], type(item), name)) - - if input_list is not None: - for i, item in enumerate(input_list): - assert item is not None, "Receive None input." - name = "input" + str(i) - _infer_item_spec(item, name, batch_size, self.inputs_spec) - if label_list is not None: - for i, item in enumerate(label_list): - assert item is not None, "Receive None input." - name = "label" + str(i) - _infer_item_spec(item, name, batch_size, self.labels_spec) - - self.inputs_spec = self._validate_spec(self.inputs_spec) - self.labels_spec = self._validate_spec(self.labels_spec) - def fit(self, train_data, train_sample_split=None, batch_size=1, epochs=1, steps_per_epoch=None, + log_freq=10, + save_dir=None, + save_freq=1, valid_data=None, valid_sample_split=None, valid_freq=1, valid_steps=None, collate_fn=None, - callbacks=None): + callbacks=None, + verbose=2): """ Trains the model for a fixed number of epochs. If `valid_data` is set, evaluation will be done at the end of each epoch. 
@@ -560,80 +774,90 @@ def fit(self, epochs=2, batch_size=64) """ - self.mode = 'train' - self._infer_sample_spec(train_data, batch_size, train_sample_split) - if not self._mode_init_states[self.mode]: - self._prepare_single_mode(self.mode) + self._mode = 'train' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + train_data, train_sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) else: - self._switch_mode("train") - - assert self.mode in self._dist_main_progs, \ - "train model is not ready, please call `engine._prepare_single_mode('train')` first." - train_dataloader = self._create_dataloader(train_data, batch_size, - epochs, steps_per_epoch, - collate_fn) - - fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) - fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) - inner_fetch = dict(fetch_loss, **fetch_metrics) - usr_fetch = self._validate_fetches(_get_fetches()) - fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) - lr_scheduler = self._get_lr_scheduler(self.main_program) - - outputs = defaultdict(list) + self._switch_mode(self._mode) + + assert self._mode in self._dist_main_progs, \ + "train model is not ready, please call `engine._prepare_program('train')` first." + + train_dataloader = self._prepare_dataloader_from_generator( + dataset=train_data, + capacity=70, + iterable=False, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) + + cbks = config_callbacks( + callbacks, + engine=self, + batch_size=batch_size, + epochs=epochs, + steps=train_dataloader._steps, + log_freq=log_freq, + save_freq=save_freq, + save_dir=save_dir, + verbose=verbose, + metrics=self._metrics_name(), + acc_step=self._k_steps, + ) + + cbks.on_begin('train') for epoch in range(epochs): - train_logs = {"epoch: {:d} ": epoch} + logs = {} + cbks.on_epoch_begin(epoch) for step, _ in enumerate(train_dataloader): + cbks.on_batch_begin('train', step, logs) try: outs = self._executor.run( self.main_program, - fetch_list=fetch_list, + fetch_list=fetch_names, use_program_cache=self._strategy.use_cache, return_numpy=self._strategy.return_numpy) except core.EOFException: break - train_logs["step: {:d} "] = step - # update lr - if lr_scheduler and step % self._k_steps == 0: - lr_scheduler.step() - train_logs["lr: {:5e} "] = self._get_lr(self._lr_optimizer) - # inner fetches - if fetch_loss: - train_logs["loss: {:8f} "] = outs[0][0] - outputs["loss"].append(outs[0][0]) - # Metric - if fetch_metrics: - metric_out = outs[len(fetch_loss):len(inner_fetch)] - for metric in self._metrics: - metric.update(*metric_out) - results = metric.accumulate() - for i, res in enumerate(to_list(results)): - train_logs[metric.name()[i] + ": {:8f} "] = res - outputs[metric.name()[i]].append(outs[0][0]) - # user fetches - user_outs = outs[len(inner_fetch):] - user_fetch_list = fetch_list[len(inner_fetch):] - for i, out in enumerate(user_outs): - train_logs[fetch_map[user_fetch_list[i]] + ": {}"] = out - # logger - string = '[train] ' + ''.join(list(train_logs.keys())) - self._logger.info(string.format(*list(train_logs.values()))) - - if valid_data and epoch % valid_freq == 0: - self.evaluate(valid_data, valid_sample_split, batch_size, - valid_steps, collate_fn, callbacks) + lr = get_lr(self._optimizer) + logs = 
self._prepare_logger(outs, epoch, step, lr, fetch_names, + fetch_indices, self._mode) + cbks.on_batch_end('train', step, logs) + + if valid_data and (epoch + 1) % valid_freq == 0: + val_logs = self.evaluate(valid_data, valid_sample_split, + batch_size, valid_steps, log_freq, + collate_fn, callbacks, verbose) + val_logs = { + "val_" + name: val + for name, val in val_logs.items() + } + logs.update(val_logs) self._switch_mode("train") else: self._reset_metrics() - return outputs + + cbks.on_epoch_end(epoch, logs) + + cbks.on_end('train', logs) + return self.history def evaluate(self, valid_data, valid_sample_split=None, batch_size=1, steps=None, + log_freq=10, collate_fn=None, - callbacks=None): + callbacks=None, + verbose=2): """ Evaluate the loss and metrics of the model on evaluation data. @@ -652,7 +876,7 @@ def evaluate(self, the sample list, None for only stack each fields of sample in axis 0. Default None. callbacks (Callback|None, optional): A list of `Callback` instances to apply - during evaling. Default: None. (Unused for now) + during evaluating. Default: None. (Unused for now) Returns: None @@ -680,60 +904,59 @@ def evaluate(self, engine.evaluate(valid_dataset, batch_size=64) """ - self.mode = 'eval' - self._infer_sample_spec(valid_data, batch_size, valid_sample_split) - if not self._mode_init_states[self.mode]: - self._prepare_single_mode(self.mode) + self._mode = 'eval' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + valid_data, valid_sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) else: - self._switch_mode("eval") - - assert self.mode in self._dist_main_progs, \ - "eval model is not ready, please call `engine._prepare_single_mode('eval')` first." - valid_dataloader = self._create_dataloader(valid_data, - batch_size, - steps_per_epoch=steps, - collate_fn=collate_fn) - - fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) - fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) - inner_fetch = dict(fetch_loss, **fetch_metrics) - usr_fetch = self._validate_fetches(_get_fetches()) - fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) - - outputs = defaultdict(list) + self._switch_mode(self._mode) + + assert self._mode in self._dist_main_progs, \ + "eval model is not ready, please call `engine._prepare_program('eval')` first." 
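A small hedged sketch of the reworked `evaluate` entry point above (names follow the docstring example): it now drives the callbacks per batch and returns the logs dict assembled by `_prepare_logger` for the last evaluation step, rather than accumulating outputs itself.

# Sketch only; the exact keys of the returned dict depend on the configured loss and metrics.
eval_logs = engine.evaluate(valid_dataset, batch_size=64, log_freq=10, verbose=2)
for name, value in eval_logs.items():
    print(name, value)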
+ valid_dataloader = self._prepare_dataloader_from_generator( + dataset=valid_data, + capacity=70, + iterable=False, + batch_size=batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) + + cbks = config_callbacks( + callbacks, + engine=self, + batch_size=batch_size, + log_freq=log_freq, + verbose=verbose, + metrics=self._metrics_name(), + ) + + eval_steps = valid_dataloader._steps + cbks.on_begin('eval', { + 'steps': eval_steps, + 'metrics': self._metrics_name() + }) + logs = {} for step, _ in enumerate(valid_dataloader): + cbks.on_batch_begin('eval', step, logs) try: outs = self._executor.run( self.main_program, - fetch_list=fetch_list, + fetch_list=fetch_names, use_program_cache=self._strategy.use_cache, return_numpy=self._strategy.return_numpy) except core.EOFException: break - eval_logs = {"step: {:d} ": step} - # inner fetches - if fetch_loss: - eval_logs["loss: {:8f} "] = outs[0][0] - outputs["eval_loss"].append(outs[0][0]) - # Metric - if fetch_metrics: - metric_out = outs[len(fetch_loss):len(inner_fetch)] - for metric in self._metrics: - metric.update(*metric_out) - results = metric.accumulate() - for i, res in enumerate(to_list(results)): - eval_logs[metric.name()[i] + ": {:8f} "] = res - outputs["eval_" + metric.name()[i]].append(res) - # user fetches - usr_outs = outs[len(inner_fetch):] - usr_fetch_list = fetch_list[len(inner_fetch):] - for i, out in enumerate(usr_outs): - eval_logs[fetch_map[usr_fetch_list[i]] + ": {}"] = out - # logger - string = '[eval] ' + ''.join(list(eval_logs.keys())) - self._logger.info(string.format(*list(eval_logs.values()))) + logs = self._prepare_logger(outs, None, step, None, fetch_names, + fetch_indices, self._mode) + cbks.on_batch_end('eval', step, logs) + cbks.on_end('eval', logs) self._reset_metrics() - return outputs + return logs def predict(self, test_data, @@ -741,7 +964,8 @@ def predict(self, batch_size=1, steps=None, collate_fn=None, - callbacks=None): + callbacks=None, + verbose=2): """ Compute the output predictions on testing data. @@ -785,72 +1009,223 @@ def predict(self, engine = auto.Engine(model) engine.predict(valid_dataset, batch_size=64) """ - self.mode = 'predict' - self._infer_sample_spec(test_data, batch_size, test_sample_split) - if not self._mode_init_states[self.mode]: - self._prepare_single_mode(self.mode) + self._mode = 'predict' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + test_data, test_sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) else: - self._switch_mode("predict") + self._switch_mode(self._mode) - assert self.mode in self._dist_main_progs, \ - "predict model is not ready, please call `engine._prepare_single_mode('predict')` first." - test_dataloader = self._create_dataloader(test_data, - batch_size, - steps_per_epoch=steps, - collate_fn=collate_fn) + assert self._mode in self._dist_main_progs, \ + "predict model is not ready, please call `engine._prepare_program('predict')` first." 
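Similarly, a hedged note on the `predict` path that begins here: as the loop further below shows, each step's fetched outputs are appended to a list, so the call returns one entry per predicted batch. The dataset and engine names are placeholders from the docstring example.

outputs = engine.predict(valid_dataset, batch_size=64, verbose=2)
# outputs is expected to be a list with one element per batch, holding the values of the
# model's fetched output tensors for that batch (taken from logs["outputs"]).
print(len(outputs))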
- fetch_outputs = self._validate_fetches(self.fetch_vars["outputs"]) - usr_fetch = self._validate_fetches(_get_fetches()) - fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) + test_dataloader = self._prepare_dataloader_from_generator( + dataset=test_data, + capacity=70, + iterable=False, + batch_size=batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) outputs = [] + cbks = config_callbacks(callbacks, engine=self, verbose=verbose) + test_steps = test_dataloader._steps + cbks.on_begin('predict', {'steps': test_steps}) + logs = {} for step, _ in enumerate(test_dataloader): + cbks.on_batch_begin('predict', step, logs) try: outs = self._executor.run( self.main_program, - fetch_list=fetch_list, + fetch_list=fetch_names, use_program_cache=self._strategy.use_cache, return_numpy=self._strategy.return_numpy) except core.EOFException: break - predict_logs = {"step: {:d} ": step} - outputs.append(outs[:len(fetch_outputs)]) - for i, out in enumerate(outs): - predict_logs[fetch_map[fetch_list[i]] + ": {}"] = out - # logger - string = '[pred] ' + ''.join(list(predict_logs.keys())) - self._logger.info(string.format(*list(predict_logs.values()))) - + logs = self._prepare_logger(outs, None, step, None, fetch_names, + fetch_indices, self._mode) + cbks.on_batch_end('predict', step, logs) + outputs.append(list(logs["outputs"].values())) + cbks.on_end('predict', logs) return outputs - def _tune(self, tune_data, tune_sample_split=None, batch_size=1): - self.mode = 'train' - self._infer_sample_spec(tune_data, batch_size, tune_sample_split) - self._optimization_tuning(self.mode, tune_data, batch_size) + def dataloader(self, + dataset, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None, + sample_split=1, + mode=None): + if mode is not None: + self.to_mode(mode) + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + dataset, sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + dataloader = self._prepare_dataloader( + dataset, + return_list=False, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=collate_fn, + num_workers=num_workers, + use_buffer_reader=use_buffer_reader, + use_shared_memory=use_shared_memory, + timeout=timeout, + worker_init_fn=worker_init_fn, + epochs=epochs, + steps_per_epoch=steps_per_epoch) + return dataloader - def _create_dataloader(self, - dataset, - batch_size, - epochs=1, - steps_per_epoch=None, - collate_fn=None): + def dataloader_from_generator(self, + dataset, + capacity=70, + use_double_buffer=True, + iterable=True, + use_multiprocess=False, + drop_last=True, + batch_size=1, + epochs=1, + steps_per_epoch=None, + collate_fn=None, + sample_split=1, + mode=None): + if mode is not None: + self.to_mode(mode) + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + dataset, sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + dataloader = self._prepare_dataloader_from_generator( + dataset=dataset, + 
capacity=capacity, + use_double_buffer=use_double_buffer, + iterable=iterable, + return_list=False, + use_multiprocess=use_multiprocess, + drop_last=drop_last, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn) + return dataloader + + def prepare(self, + inputs_spec=None, + labels_spec=None, + inputs=None, + labels=None, + main_program=None, + startup_program=None, + mode=None): + if mode is not None: + self.to_mode(mode) + if inputs or labels: + self._skip_build = True + self._inputs_spec = inputs_spec + self._labels_spec = labels_spec + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec, inputs, labels) + self._orig_main_prog = main_program + if self._orig_main_prog is None: + self._orig_main_prog = static.default_main_program() + self._orig_startup_prog = startup_program + if self._orig_startup_prog is None: + self._orig_startup_prog = static.default_startup_program() + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + elif inputs_spec or labels_spec: + self._inputs_spec = inputs_spec + self._labels_spec = labels_spec + self._outside_dataloader = True + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + self._orig_main_prog = main_program + if self._orig_main_prog is None: + self._orig_main_prog = static.default_main_program() + self._orig_startup_prog = startup_program + if self._orig_startup_prog is None: + self._orig_startup_prog = static.default_startup_program() + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + else: + assert self._inputs_spec and self._labels_spec, \ + "Please call the dataloader(...) before calling prepare(...)" + + def run(self, data=None, feed=None, fetch_list=None, mode=None): + if mode is not None: + self.to_mode(mode) + feed_dict = self._prepare_feed(data, feed, self._mode) + fetch_names, fetch_indices = self._prepare_fetch(fetch_list, self._mode) + if self._outside_dataloader and not self._has_prepared_reader[ + self._mode]: + self._prepare_reader() + outs = self._executor.run(self.main_program, + feed=feed_dict, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy) + logs = self._prepare_logger(outs, None, None, None, fetch_names, + fetch_indices, self._mode) + return logs + + def _prepare_dataloader(self, + dataset, + return_list=True, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None): if self._strategy.gradient_merge and batch_size is not None: assert batch_size % self._k_steps == 0, \ "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self._k_steps) batch_size //= self._k_steps - dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] - dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] - dist_context = self._dist_contexts[self.mode] + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. 
# Cause predict_program does not contain labels var, # then we will add labels var from serial_program to dist_program, # that maintains the length of feed_list equal to the length of dataset's values. - inputs_var = self._feed_vars[self.mode]["inputs"] - labels_var = self._feed_vars[self.mode]["labels"] + inputs_var = self._feed_vars[self._mode]["inputs"] + labels_var = self._feed_vars[self._mode]["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: @@ -860,45 +1235,99 @@ def _create_dataloader(self, copy_var.desc.set_original_id(var.desc.original_id()) feed_list.append(copy_var) - # remove the first three ops if multi run fit/evaluate/predict - op_size = len(dist_main_block.ops) - if dist_main_block.ops[0].type == 'create_py_reader': - op_size -= 3 - for _ in range(3): - dist_main_block._remove_op(0, sync=False) - # insert read op at the end of program places = paddle.static.cuda_places() with static.program_guard(dist_main_prog, dist_startup_prog): - dataloader = NonIterableGeneratorLoader( + dataloader = DistributedDataLoader( dataset, - feed_list, - places, - batch_size, - epochs, - steps_per_epoch, - collate_fn, + feed_list=feed_list, + places=places, + return_list=return_list, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=collate_fn, + num_workers=num_workers, + use_buffer_reader=use_buffer_reader, + use_shared_memory=use_shared_memory, + timeout=timeout, + worker_init_fn=worker_init_fn, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + split_data=self._strategy.split_data, data_parallel_world_size=self._dp_world_sizes, - data_parallel_rank=self._dp_ranks, - split_data=self._strategy.split_data) + data_parallel_rank=self._dp_ranks) - # move read op from the end of program to the start of program - new_op_size = len(dist_main_block.ops) - for _ in range(new_op_size - 1, op_size - 1, -1): - op = dist_main_block.ops[new_op_size - 1] - new_op_desc = dist_main_block.desc._prepend_op() - new_op_desc.copy_from(op.desc) - new_op = Operator(dist_main_block, - new_op_desc, - type=new_op_desc.type()) - dist_main_block.ops.insert(0, new_op) - dist_op = DistributedOperator(new_op) - dist_context.add_dist_op_for_program(dist_op) - for _ in range(new_op_size - op_size): - dist_main_block._remove_op(new_op_size, sync=False) - dist_main_block._sync_with_cpp() return dataloader + def _prepare_dataloader_from_generator(self, + dataset, + capacity=None, + use_double_buffer=True, + iterable=True, + return_list=False, + use_multiprocess=False, + drop_last=True, + batch_size=1, + epochs=1, + steps_per_epoch=None, + collate_fn=None): + + if self._strategy.gradient_merge and batch_size is not None: + assert batch_size % self._k_steps == 0, \ + "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self._k_steps) + batch_size //= self._k_steps + + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_block = dist_main_prog.global_block() + + # NOTE: Get feed_list, then insert dataloader op with sharded var shape. + # Cause predict_program does not contain labels var, + # then we will add labels var from serial_program to dist_program, + # that maintains the length of feed_list equal to the length of dataset's values. 
+ inputs_var = self._feed_vars[self._mode]["inputs"] + labels_var = self._feed_vars[self._mode]["labels"] + feed_list = [] + for var in inputs_var + labels_var: + if var.name in dist_main_block.vars: + feed_list.append(dist_main_block.vars[var.name]) + else: + copy_var = dist_main_block._clone_variable(var, var.persistable) + copy_var.desc.set_original_id(var.desc.original_id()) + feed_list.append(copy_var) + + places = paddle.static.cuda_places() + with static.program_guard(dist_main_prog, dist_startup_prog): + dataloader = DistributedDataLoaderFromGenerator( + dataset=dataset, + feed_list=feed_list, + capacity=capacity, + use_double_buffer=use_double_buffer, + iterable=iterable, + return_list=return_list, + use_multiprocess=use_multiprocess, + drop_last=drop_last, + places=places, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn, + split_data=self._strategy.split_data, + data_parallel_world_size=self._dp_world_sizes, + data_parallel_rank=self._dp_ranks) + self._prepare_reader() + return dataloader + + def _tune(self, tune_data, tune_sample_split=None, batch_size=1): + self._mode = 'train' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + tune_data, tune_sample_split, batch_size) + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + self._optimization_tuning(self._mode, tune_data, batch_size) + def _validate_spec(self, specs): specs = to_list(specs) self._k_steps = self._strategy.gradient_merge.k_steps @@ -921,32 +1350,6 @@ def _is_local_var(self, var): var_name = _to_name_str(var) return var_name in self.main_program.global_block().vars - def _validate_fetches(self, fetches): - # 1. Check user-defined fetches type - # 2. Prepare fetches_dict like {user_defined_name: var_name} - if not fetches: - return {} - if isinstance(fetches, dict): - fetch_var_names = list(map(_to_name_str, fetches.values())) - fetches_dict = dict(zip(fetch_var_names, list(fetches.keys()))) - elif isinstance(fetches, list): - fetch_var_names = list(map(_to_name_str, fetches)) - fetches_dict = dict(zip(fetch_var_names, fetch_var_names)) - else: - raise TypeError("'fetches' only support 'dict' and 'list', " - "but got '{}'".format(str(type(fetches)))) - return dict( - filter(lambda x: self._is_local_var(x[0]), fetches_dict.items())) - - def _fetch_map(self, inner_fetch, usr_fetch): - # replace inner fetch name if usr set for it - for iname in inner_fetch: - if iname in usr_fetch: - inner_fetch[iname] = usr_fetch[iname] - usr_fetch.pop(iname) - fetches = dict(inner_fetch, **usr_fetch) - return list(fetches.keys()), fetches - def _get_input_split_info(self, var, dist_context): # deduce how the input data is split among the cluster from .utils import _get_comm_group, _get_corresponding_rank @@ -1007,9 +1410,20 @@ def _reset_metrics(self): for metric in self._metrics: metric.reset() + def _metrics_name(self): + metrics_name = ['loss'] if self._loss else [] + for m in self._metrics: + metrics_name.extend(to_list(m.name())) + return metrics_name + def _switch_mode(self, mode): - self.mode = mode - self._initialize(mode) + self.to_mode(mode) + self._optimizer = self._dist_contexts[mode]._serial_optimizer + + def to_mode(self, mode): + assert mode in ["train", "eval", "predict"], \ + "mode {} should be one of ['train', 'eval', 'predict']".format(mode) + self._mode = mode def _set_state_dict(self, mode, strict, state_dict, dist_attr): program = self._dist_main_progs[mode][self._cur_rank] @@ -1029,7 +1443,7 @@ def 
save(self, path, training=True): is 'dirname/file_prefix' or 'file_prefix'. if empty str. A exception will be raised. training (bool, optional): Whether to save for training. If not, save - for inference only. If `training` is set to True, the optimzer state + for inference only. If `training` is set to True, the optimizer state will be saved. Otherwise, only the model and parameters are saved. This function will silently overwrite existing file at the target location. Default: True. @@ -1065,20 +1479,19 @@ def save(self, path, training=True): """ if training: - assert 'train' in self._serial_main_progs, \ - "training model is not ready, please call `engine._prepare_single_mode('train')` first." - serial_program = self._serial_main_progs["train"] - dist_main_prog = self._dist_main_progs["train"][self._cur_rank] - dist_context = self._dist_contexts["train"] + assert self._mode in self._serial_main_progs + serial_program = self._serial_main_progs[self._mode] + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] self._saver.save(path, serial_program=serial_program, dist_main_program=dist_main_prog, dist_context=dist_context) else: - mode = "predict" - feed_vars = self._feed_vars[mode]['inputs'] - fetch_vars = self._fetch_vars[mode]['outputs'] - dist_main_prog = self._dist_main_progs[mode][self._cur_rank] + assert "predict" in self._dist_main_progs + feed_vars = self._feed_vars["predict"]['inputs'] + fetch_vars = self._fetch_vars["predict"]['outputs'] + dist_main_prog = self._dist_main_progs["predict"][self._cur_rank] self._saver.save_inference_model(path, feed_vars, fetch_vars, @@ -1097,7 +1510,7 @@ def load(self, path, strict=True, load_optimizer=True): the parameter in file storing model states of or receives a mismatch shape). Default: False. load_optimizer (bool, optional): If True, the stored optimizer - states is restored. Otherwise, the optimizer states is intialized + states is restored. Otherwise, the optimizer states is initialized from scratch. Default: False. Returns: @@ -1136,65 +1549,82 @@ def load(self, path, strict=True, load_optimizer=True): path, load_optimizer) return self._state_dict, self._dist_attr - @staticmethod - def _get_lr_scheduler(program): - lr_sheduler = None - if hasattr(program, 'lr_sheduler'): - from paddle.optimizer.lr import LRScheduler - lr_sheduler = program.lr_sheduler - assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" - return lr_sheduler - - def _get_lr(self, optimizer): - if isinstance(optimizer, paddle.optimizer.Optimizer): - return optimizer.get_lr() - elif isinstance(optimizer, paddle.fluid.optimizer.Optimizer): - if isinstance(optimizer._learning_rate, float): - return optimizer._learning_rate - else: - return optimizer._learning_rate() + def cost(self, inputs_spec=None, labels_spec=None, mode="train"): + """ + Get and Print cost, including memory of every rank, + max memory among all ranks, and the global cost of one step based on + communication cost(computation cost is 0 by default). + In the future, the flops information of every rank and global cost including + computation cost will be added. + + Args: + inputs_spec(InputSpec): The specification of inputs. Default: None. + labels_spec(InputSpec): The specification of labels. Default: None. + mode (str): The engine mode must be in ["train", "predict", "eval"]. Default: "train". + + Returns: + Return the global execution time (ms) and max memory (B). 
+ + """ + # Check parallel mode + if self._strategy.auto_mode == "full": + print( + "The cost will be calcudated in the search process when the auto mode is full." + ) + return + + # Check mode + accepted_modes = ["train", "predict", "eval"] + if mode not in accepted_modes: + raise ValueError("The mode {} is not in accepted modes {}".format( + mode, accepted_modes)) + self.to_mode(mode) + + if inputs_spec is not None: + self._inputs_spec, self._labels_spec = inputs_spec, labels_spec + self._inputs, self._labels = self._prepare_data_tensor( + self._inputs_spec, self._labels_spec) + self._build(mode) + self._plan(mode) else: - raise TypeError( - "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \ - " or `paddle.fluid.optimizer.Optimizer`, but got {}.".format(type(optimizer)) + if _non_static_mode() or self._dygraph_mode: + raise ValueError( + "Please call `engine._prepare_program('mode')` firstly when in the static graph mode." ) - @property - def mode(self): - return self._mode + # Estimate the exec cost and max memory + global_cost, max_memory = get_cost_from_engine(self, mode) - @mode.setter - def mode(self, mode): - self._mode = mode + return global_cost.time, max_memory @property def main_program(self): - return self._dist_main_progs[self.mode][self._cur_rank] + return self._dist_main_progs[self._mode][self._cur_rank] @property def startup_program(self): - return self._dist_startup_progs[self.mode][self._cur_rank] + return self._dist_startup_progs[self._mode][self._cur_rank] @property def dist_context(self): - return self._dist_contexts[self.mode] + return self._dist_contexts[self._mode] @property def serial_main_program(self): - return self._serial_main_progs[self.mode] + return self._serial_main_progs[self._mode] @property def serial_startup_program(self): - return self._serial_startup_progs[self.mode] + return self._serial_startup_progs[self._mode] @property def fetch_vars(self): - return self._fetch_vars[self.mode] + return self._fetch_vars[self._mode] @property def inputs(self): - return self.inputs_spec + return self._inputs @property def labels(self): - return self.labels_spec + return self._labels diff --git a/python/paddle/distributed/auto_parallel/helper.py b/python/paddle/distributed/auto_parallel/helper.py index 4a3a1ab5e15cc..7faa426ed3430 100644 --- a/python/paddle/distributed/auto_parallel/helper.py +++ b/python/paddle/distributed/auto_parallel/helper.py @@ -139,7 +139,7 @@ def call_metrics(self, inputs): """ outs = [] for metric in self.metrics: - outs.extend(metric.compute(*inputs)) + outs.append(to_list(metric.compute(*inputs))) return outs diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index dae8cb41e66e5..a0dcb488658b8 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from collections import defaultdict + import paddle from paddle.fluid import core from .process_mesh import ProcessMesh @@ -196,15 +198,36 @@ def __call__(self, *args, **kwargs): return RecomputeOperator(op) -_g_fetched_tensors = {} +_g_collections = {} + + +class CollectionNames(object): + FETCHES = "fetches" + LOGGING = "logging" + + +def get_collection(name): + collection = _g_collections.get(name, None) + if collection is None: + collection = [] + _g_collections[name] = collection + return _g_collections[name] -def fetch(tensor, name=None): - if name is None: - _g_fetched_tensors[tensor.name] = tensor +def add_to_collection(collection_name, value, name=None): + if collection_name not in _g_collections: + _g_collections[collection_name] = [] + if name is not None: + for _, v in _g_collections[collection_name]: + if v == value: return + _g_collections[collection_name].append((name, value)) else: - _g_fetched_tensors[name] = tensor + for _, v in _g_collections[collection_name]: + if v == value: return + _g_collections[collection_name].append((None, value)) -def _get_fetches(): - return _g_fetched_tensors +def fetch(tensor, name=None, logging=False): + add_to_collection(CollectionNames.FETCHES, tensor, name) + if logging: + add_to_collection(CollectionNames.LOGGING, tensor, name) diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 02b5138be2146..4a0a05a4f1cd4 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -33,3 +33,5 @@ from . import dist_fused_feedforward from . import dist_fused_attention from . import dist_reduce_sum_p +from . import dist_shape +from . import dist_assign diff --git a/python/paddle/distributed/auto_parallel/operators/dist_assign.py b/python/paddle/distributed/auto_parallel/operators/dist_assign.py new file mode 100644 index 0000000000000..96923f461a73d --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_assign.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
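The reworked `fetch` above now records tensors in named collections rather than a flat dict, and `logging=True` additionally routes them to the LOGGING collection. A minimal hedged sketch of that bookkeeping; `_FakeVar` is a purely illustrative stand-in, in practice the argument would be a variable from the serial program.

from paddle.distributed.auto_parallel.interface import (CollectionNames, fetch,
                                                        get_collection)

class _FakeVar:
    # illustrative stand-in for a Paddle Variable; only .name is used here
    def __init__(self, name):
        self.name = name

aux = _FakeVar("aux_loss_0")
fetch(aux, name="aux_loss", logging=True)  # registered under FETCHES and, via logging=True, LOGGING

for name, var in get_collection(CollectionNames.LOGGING):
    print(name or var.name)  # collections store (name, value) pairs; name may be None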
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .dist_default import DistributedDefaultImpl0 +from ..utils import compute_compatible_and_update_dim_mapping + + +class DistributedAssign(DistributedOperatorImplContainer): + + def __init__(self, op_type): + super(DistributedAssign, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedAssign("assign")) + + +class DistributedAssignImpl(DistributedOperatorImpl): + + def __init__(self, name): + super(DistributedAssignImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("assign", DistributedAssignImpl("assign")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 3be84c55126bf..8f2db1a3b2637 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1308,6 +1308,8 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): process_mesh = dist_attr.process_mesh processes = process_mesh.processes # col parallel: matmul + allreduce + if backward_op.attr("trans_y"): + Y_var_dim_mapping.reverse() assert Y_var_dim_mapping[0] < 0 parallel_axis = Y_var_dim_mapping[1] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_shape.py b/python/paddle/distributed/auto_parallel/operators/dist_shape.py new file mode 100644 index 0000000000000..313f296ab9624 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_shape.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .dist_default import DistributedDefaultImpl0 +from ..utils import is_dim_shard + + +class DistributedShape(DistributedOperatorImplContainer): + + def __init__(self, op_type): + super(DistributedShape, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedShape("shape")) + + +class DistributedShapeImpl(DistributedOperatorImpl): + + def __init__(self, name): + super(DistributedShapeImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + assert len(out_dims_mapping) == 1 + if is_dim_shard(out_dims_mapping[0]): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + return True + + def update_dims_mapping(self, dist_op): + return False + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("shape", DistributedShapeImpl("shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_split.py b/python/paddle/distributed/auto_parallel/operators/dist_split.py index 8f89020b53ca4..9b7c680d7921d 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_split.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_split.py @@ -101,8 +101,12 @@ def update_dims_mapping(self, dist_op): return changed def is_auto_compatible(self, dist_op): - raise NotImplementedError( - "Auto Search is not supported by dist split yet.") + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)) or \ + (not self.is_compatible(dist_op)): + return False + + return True @staticmethod def forward(ctx, *args, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 3d7394216140c..75fb3d1ec5200 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -23,14 +23,12 @@ import pickle import time import paddle -from paddle.fluid.backward import append_backward -from paddle.distributed.utils.log_utils import get_logger -from paddle.distributed.fleet import cloud_utils import paddle.fluid.core as core from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.utils.log_utils import get_logger from paddle.distributed.passes import 
new_pass, PassContext from .dist_context import DistributedContext -from .dist_context import get_default_distributed_context from .dist_context import set_default_distributed_context from .completion import Completer from .partitioner import Partitioner @@ -40,9 +38,7 @@ from .process_group import _g_process_group_map, ProcessGroup from .utils import make_data_unshard from .utils import set_grad_var_shape -from .utils import print_program_with_dist_attr from .utils import SerialProgramInfo -from .utils import get_logger from .reshard import Resharder from .cluster import Cluster from .mapper import mapping @@ -148,7 +144,7 @@ def _apply_optimize(self, main_program, startup_program, params_grads): with program_guard(main_program, startup_program): optimize_ops = optimizer.apply_gradients(params_grads) - self._dist_context._lr_optimizer = optimizer + self._dist_context._serial_optimizer = optimizer # update completion self._completer = Completer(self._dist_context) self._completer.complete_update_annotation(main_program) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index a1dd58fef7131..e87c401055e75 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -15,24 +15,17 @@ import copy import time import logging -from collections import defaultdict -import paddle from paddle.fluid import program_guard from paddle.fluid.backward import append_backward -from paddle.fluid.framework import _non_static_mode, unique_name +from paddle.fluid.framework import unique_name from paddle.distributed.passes import new_pass from .reshard import Resharder from .partitioner import Partitioner -from .dist_op import DistributedOperator -from .dist_saver import DistributedSaver -from .dist_loader import NonIterableGeneratorLoader -from .utils import make_data_unshard, set_grad_var_shape -from .utils import print_program_with_dist_attr, to_list -from .utils import get_logger -from .process_group import get_all_process_groups, get_world_process_group -from .dist_context import DistributedContext, get_default_distributed_context +from .utils import set_grad_var_shape +from .process_group import get_world_process_group +from ..utils.log_utils import get_logger class Parallelizer: @@ -69,7 +62,7 @@ def parallel(self, rank): serial_main_program, serial_startup_program, params_grads = self._apply_pre_optimization( serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) - self._logger.info( + self._logger.debug( "within parallel apply_pre_optimization time: {}, mode {}". 
format(time.time() - time0, self._mode)) # Do logical partition @@ -77,14 +70,14 @@ def parallel(self, rank): partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, params_grads) - self._logger.info( + self._logger.debug( "within parallel partitioner time: {}, mode {}".format( time.time() - time0, self._mode)) # Generate optimizer time0 = time.time() self._generate_optimizer(dist_main_prog, dist_startup_prog, serial_optimizer, dist_params_grads) - self._logger.info( + self._logger.debug( "within parallel optimizer time: {}, mode {}".format( time.time() - time0, self._mode)) # Do reshard process @@ -93,14 +86,14 @@ def parallel(self, rank): resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, dist_params_grads) resharder.reshard() - self._logger.info( + self._logger.debug( "within parallel reshard time: {}, mode {}".format( time.time() - time0, self._mode)) # Apply post optimization passes time0 = time.time() self._apply_post_optimization(dist_main_prog, dist_startup_prog, rank, dist_params_grads) - self._logger.info( + self._logger.debug( "within parallel apply_post_optimization time: {}, mode {}". format(time.time() - time0, self._mode)) else: @@ -109,7 +102,7 @@ def parallel(self, rank): self._apply_pre_optimization(serial_main_program, serial_startup_program, None, None, None) - self._logger.info( + self._logger.debug( "within parallel apply_pre_optimization time: {}, mode {}". format(time.time() - time0, self._mode)) # Do logical partition @@ -118,14 +111,14 @@ def parallel(self, rank): dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, []) # Do reshard process - self._logger.info( + self._logger.debug( "within parallel partitioner time: {}, mode {}".format( time.time() - time0, self._mode)) time0 = time.time() resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() - self._logger.info( + self._logger.debug( "within parallel reshard time: {}, mode {}".format( time.time() - time0, self._mode)) # Clone program for test @@ -150,7 +143,7 @@ def _generate_optimizer(self, main_program, startup_program, optimizer, # NOTE: `apply_gradients` will add an Accumulator for a parameter only once, # but optimizer will be called repeatedly in re-launch, so optimizer need to be copied. 
optimizer = copy.deepcopy(optimizer) - self._dist_context._lr_optimizer = optimizer + self._dist_context._serial_optimizer = optimizer with program_guard(main_program, startup_program): with unique_name.guard("opt_"): optimizer_ops = optimizer.apply_gradients(params_grads) @@ -177,9 +170,7 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, startup_program = self._pass_context.get_attr("startup_program") params_grads = self._pass_context.get_attr("params_grads") - # apply amp pass - # FIXME we disenable amp for eval since it has a little bug with - # eval program and which will be fixed in future + # apply amp pass on train/eval/predict if self._strategy.amp.enable: config = copy.deepcopy(self._strategy.amp.to_dict()) config["dist_context"] = self._dist_context diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 3262505416b1d..e12a111dd2a61 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -28,7 +28,7 @@ from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op from .operators.common import BACKWARD_ONLY_DIST_OPS -__varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] +__varname_not_in_block__ = ["lod_tensor_blocking_queue"] __not_shape_var_type__ = [ core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES ] @@ -243,7 +243,9 @@ def partition_block(self, ref_block, target_block): target_block, serial_input_varname, new_varname) else: - assert serial_input_varname in __varname_not_in_block__ + for varname_not_in_block in __varname_not_in_block__: + assert varname_not_in_block in serial_input_varname, \ + "{} is not found".format(serial_input_varname) self._serial2dist_varname_mapping[ serial_input_varname] = new_varname diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 90b840c5943bc..8e2c0c4617b0f 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -14,9 +14,7 @@ from .completion import Completer from .dist_context import get_default_distributed_context -from .utils import print_program_with_dist_attr - -# from .tuner.parallel_tuner import ParallelTuner +from .tuner.parallel_tuner import ParallelTuner class Planner: @@ -39,20 +37,20 @@ def __init__(self, mode, dist_context): self._completer = Completer(self._dist_context) self._strategy = dist_context.strategy - # if self._strategy.auto_search: - # self._parallel_tuner = ParallelTuner( - # self._dist_context, mode=self._mode) + # set parallel tuner for auto search + if self._strategy.auto_mode == "full": + self._parallel_tuner = ParallelTuner(self._dist_context, + mode=self._mode) @property def completer(self): return self._completer def plan(self): - self._completer.complete_forward_annotation() - # if self._strategy.auto_search: - # self._parallel_tuner.tune() - # else: - # self._completer.complete_forward_annotation() + if self._strategy.auto_mode == "full": + self._parallel_tuner.tune() + else: + self._completer.complete_forward_annotation() # parse forward sub block self._dist_context.block_state.parse_forward_blocks( self._dist_context.serial_main_program) diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index 14ce5ea75b10c..a18fc196477da 100644 --- 
a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -168,7 +168,10 @@ def __getitem__(self, index): else: new_mesh = self._mesh[index] new_dim_names = self._dim_names[1:] - return ProcessMesh(new_mesh, new_dim_names) + if new_mesh.shape: + return ProcessMesh(new_mesh, new_dim_names) + else: + return ProcessMesh([new_mesh]) def __enter__(self): set_current_process_mesh(self) diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 52d5c607bbc57..bb3d2d6cfbaff 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -37,6 +37,7 @@ _g_gradient_clip_ops = [ "sum", "sqrt", "fill_constant", "elementwise_max", "elementwise_div" ] +_g_subblock_ops = ["while", "conditional_block"] def get_var_with_recursion(var_name, block, program): @@ -45,10 +46,11 @@ def get_var_with_recursion(var_name, block, program): if var_name in block.vars: var = block.vars[var_name] else: - parent_block = program.blocks[block.parent_idx] - if var_name in parent_block.vars: - var = parent_block.vars[var_name] - assert var is not None + var = block._var_recursive(var_name) + # parent_block = program.blocks[block.parent_idx] + # if var_name in parent_block.vars: + # var = parent_block.vars[var_name] + assert var is not None, "{} is not found".format(var_name) return var @@ -1077,7 +1079,9 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context): new_Out = [] for var_name in while_op.output("Out"): for output_name in sub_block_op_outputs[::-1]: - if output_name.find(var_name) != -1: + if output_name.find(var_name) != -1 and ( + len(var_name) == len(output_name) + or "@RESHARD" in output_name): if output_name not in new_Out: new_Out.append(output_name) assert new_Out @@ -1106,13 +1110,15 @@ def is_special_op(self, op): return False def is_condition_replicative(self, op): - assert op.type == "while" sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id] - dist_op = self.dist_context.get_dist_op_for_program(op) - op_dist_attr = dist_op.dist_attr + + if op.type == "while": + input_cond = op.input("Condition") + elif op.type == "conditional_block": + input_cond = op.input("Cond") # the dims mapping of condition tensor should be replicative - for var_name in op.input("Condition"): + for var_name in input_cond: var = get_var_with_recursion(var_name, sub_block, self.auto_parallel_main_prog) dist_tensor = self.dist_context.get_dist_tensor_for_program(var) @@ -1662,9 +1668,9 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, op.desc.set_input(proto.inputs[0].name, op.input("X") + while_op_X_append) - def _get_while_op_input_attrs(self, op, var_name): + def _get_subblock_input_attrs(self, op, var_name): # NOTE: Multi while loop is not supported - assert op.type == "while" + assert op.type in _g_subblock_ops sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id] ops = sub_block.ops input_attrs = [] @@ -1715,8 +1721,8 @@ def _get_common_op_input_attrs(self, op, var_name): def get_op_input_attrs(self, op, var_name): op_input_attrs = [] - if op.type == "while": - op_input_attrs = self._get_while_op_input_attrs(op, var_name) + if op.type in _g_subblock_ops: + op_input_attrs = self._get_subblock_input_attrs(op, var_name) else: op_input_attrs = self._get_common_op_input_attrs(op, var_name) @@ -1738,8 +1744,18 @@ def _remove_global_process_mesh(self): if
len(set(process_mesh.processes)) == len(processes): global_process_mesh_idx = idx break + if global_process_mesh_idx is not None: - self.dist_context.process_meshes.pop(idx) + is_removed = False + global_mesh = self.dist_context.process_meshes[idx] + for i, mesh in enumerate(self.dist_context.process_meshes): + if i == idx: + continue + if set(mesh.processes) < set(global_mesh.processes): + is_removed = True + + if is_removed: + self.dist_context.process_meshes.pop(idx) def _change_subblock_op_input_and_output(self, block_idx, block): if "var_reshard_mapping" in Resharder.while_block_info[block_idx]: @@ -1810,7 +1826,7 @@ def _reshard_input(self, block): if dist_op is not None: op_input_dist_attrs = [ ] # [(op_process_mesh, op_input_dims_mapping), (op_process_mesh, op_input_dims_mapping)] - if op.type == "while": + if op.type in _g_subblock_ops: if not self.is_condition_replicative(op): raise ValueError( "Please check the condition due to the dims mapping is not replicative." @@ -1824,6 +1840,8 @@ def _reshard_input(self, block): if op.type == "while": # condition var process mesh is the same with op and dims_mapping is replicative, so it do not need reshard input_var_names = op.input("X") + elif op.type == "conditional_block": + input_var_names = op.input("Input") else: input_var_names = op.input_arg_names # to avoid while op X order different @@ -1831,8 +1849,8 @@ def _reshard_input(self, block): idx_offset = 0 for var_name in input_var_names: - # skip lod_tensor_blocking_queue_0 - if var_name == "lod_tensor_blocking_queue_0": + # skip lod_tensor_blocking_queue_? name + if "lod_tensor_blocking_queue" in var_name: continue var = get_var_with_recursion(var_name, block, self.auto_parallel_main_prog) @@ -1976,11 +1994,12 @@ def _reshard_output(self, block): idx = 0 # skip reader and ops whose process mesh is union skip_ops = [ - "create_py_reader", "create_double_buffer_reader", "read", "while", + "create_py_reader", "create_double_buffer_reader", "read", "write_to_array", "read_from_array" ] global _g_special_ops skip_ops += _g_special_ops + skip_ops += _g_subblock_ops while idx < len(block.ops): pre_op_count = len(block.ops) op = block.ops[idx] diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py index 977e77d2ec326..927aa25dbfba7 100644 --- a/python/paddle/distributed/auto_parallel/strategy.py +++ b/python/paddle/distributed/auto_parallel/strategy.py @@ -116,6 +116,13 @@ def __init__(self, config_dict=None): super(TuningConfig, self).__init__(category, config_dict) +class DatasetConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.DATASET + super(DatasetConfig, self).__init__(category, config_dict) + + class Strategy(BaseConfig): """ The `Strategy` object is used to configure the paralleization and optimization beheviors. 
@@ -180,3 +187,6 @@ def __init__(self, config=None): config_dict = self._config_dict.get(constants.TUNING, None) self.tuning = TuningConfig(config_dict) + + config_dict = self._config_dict.get(constants.DATASET, None) + self.dataset = DatasetConfig(config_dict) diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index 4835d6f885ccf..4b3c53ef30b43 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -136,12 +136,24 @@ def _copy_context(ref_dist_context): for key, var_list in ref_dist_context._serial_fetch_vars.items(): new_var_list = [] - for var in var_list: - block_idx = var.block.idx - var_name = var.name - var = new_dist_context._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) - new_var_list.append(var) + # metrics is a list of list + if key == "metrics": + for inner_var_list in var_list: + new_inner_var_list = [] + for var in inner_var_list: + block_idx = var.block.idx + var_name = var.name + var = new_dist_context._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_inner_var_list.append(var) + new_var_list.append(new_inner_var_list) + else: + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = new_dist_context._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) new_dist_context._serial_fetch_vars[key] = new_var_list # copy information in forward and backward diff --git a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py new file mode 100644 index 0000000000000..24ee382f7f75a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py @@ -0,0 +1,968 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
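A hedged configuration sketch connecting the pieces in this patch: when the strategy's auto mode is "full", the engine initializes process groups through `initialize_pg_in_full_mode` and `Planner.plan()` defers to the new `ParallelTuner` instead of plain completion. Whether `auto_mode` can be assigned directly as below is an assumption based on how it is read in engine.py and planner_v2.py; the `auto` alias, import path, and constructor arguments are likewise assumed, loosely following the docstring examples.

from paddle.distributed.fleet import auto  # import path assumed

strategy = auto.Strategy()
strategy.auto_mode = "full"   # assumption: exposed as a plain strategy field
engine = auto.Engine(model, loss, optimizer, metrics, strategy=strategy)  # placeholders
engine.fit(train_dataset, epochs=2, batch_size=64)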
+ +import time +import math +import copy +import hashlib +import itertools +from collections import defaultdict +import numpy as np +from ..process_mesh import ProcessMesh +from ..completion import Completer +from ..parallelizer_v2 import Parallelizer +from ..dist_context import _node_id +from ..dist_op import DistributedOperator +from ..operators.common import find_compatible_distributed_operator_impls +from .trial import Trial, TrialStatus +from .tunable_space import TunableSpace +from .tunable_variable import Boolean, IntRange +from ..cost import CostEstimator +from .tunable_variable import Boolean, IntRange + + +class ParallelTuner: + + def __init__(self, + dist_context, + mode="train", + max_trials=25, + tuner_id=None, + seed=None, + logger=None, + loop_count=10): + self._loop_count = loop_count + self._estimator = None + self._dist_context = dist_context + assert self._dist_context._is_initialized + self._mode = mode + self._cluster = self._dist_context.cluster + self._num_machines = self._cluster.get_num_machines() + self._num_devices_per_machine = self._cluster.get_num_devices_per_machine( + ) + self._space = TunableSpace() + self._objective = "time" + self._direction = "min" + self._max_trials = max_trials + self._tuner_id = tuner_id + self._seed = seed if seed is not None else 9999 + + print("seed", + self._seed, + "mode", + self._mode, + "num_machines", + self._num_machines, + "num_devices_per_machine", + self._num_devices_per_machine, + flush=True) + self._seed_state = self._seed + self._logger = logger + self._max_collisions = 3 + self._tried_values = set() + self._num_trials = 0 + self._rng = np.random.default_rng(self._seed) + + # Search the op types in the include_op_types, + # and will search all op types if it is empty. + # Exclude the op types in the exclude_op_types + # from the search list. + self._exclude_op_types = [] + self._include_op_types = [] + # The final dist ops will be searched after considering + # the include_op_types and exclude_op_types. + self._concerned_dist_ops = {} + + self._op_id_to_dist_attr_candidates = defaultdict(list) + self._cached_dims_mapping_candidates = {} + self._cached_candidates_info = defaultdict(list) + + self._special_ops = [ + "create_py_reader", "create_double_buffer_reader", "read", "while", + "read_from_array", "write_to_array" + ] + + # Each parallel strategy has three elements: the first one is for distributed tensors, + # the second one is for distributed operators, and the third one is for process meshes.
+ self._init_parallel_strategy = [None, None, None] + self._best_parallel_strategy = [None, None, None] + + self._completer = Completer(self._dist_context) + + self._parallelizer = Parallelizer(self._mode, self._completer, + self._dist_context) + + def _generate_combination(self, + elements, + target, + idx, + partial_candidate, + candidates, + num_candidates=None): + if target == 0: + candidates.append(copy.deepcopy(partial_candidate)) + return + + if target < 0 or idx == len(elements) \ + or len(candidates) > num_candidates: + return + + # Use + partial_candidate.append(elements[idx]) + self._generate_combination(elements, target - elements[idx], idx, + partial_candidate, candidates, + num_candidates) + # Not use + partial_candidate.pop() + self._generate_combination(elements, target, idx + 1, partial_candidate, + candidates, num_candidates) + + def _permute_combination(self, + combination, + target, + check, + partial_candidate, + candidates, + num_candidates=None, + skip_prob=None): + if num_candidates is not None \ + and len(candidates) == num_candidates: + return + + if len(partial_candidate) == len(combination): + candidates.append(partial_candidate) + return + + for i in range(len(combination)): + if check[i] == 1: + continue + if self._rng.choice([True, False], p=[skip_prob, 1 - skip_prob]): + continue + if i > 0 and combination[i] == combination[i - 1] \ + and check[i -1] == 0: + continue + check[i] = 1 + self._permute_combination(combination, target, check, + partial_candidate + [combination[i]], + candidates, num_candidates, skip_prob) + check[i] = 0 + + def _partition_number(self, target): + log2_target = int(math.log2(target)) + elements = [pow(2, i) for i in range(log2_target)] + if pow(2, log2_target) == target: + elements.append(target) + seed_candidates = [] + num_seed_candidates = 1000 + partial_results = [] + self._generate_combination(elements, target, 0, partial_results, + seed_candidates, num_seed_candidates) + + candidates = [] + for seed_candidate in seed_candidates: + cur_candidates = [] + num_cur_candidates = 16 + seed_candidate.sort() + check = [0 for i in range(len(seed_candidate))] + if target <= 8: + skip_prob = 0.0 + else: + skip_prob = (len(seed_candidate) / target) + self._permute_combination(seed_candidate, target, check, [], + cur_candidates, num_cur_candidates, + skip_prob) + candidates.extend(cur_candidates) + return candidates + + def _partition_devices(self, num_machines, num_devices_per_machine): + inter_node_partitions = self._partition_number(num_machines) + intra_node_partitions = self._partition_number(num_devices_per_machine) + return inter_node_partitions, intra_node_partitions + + def _generate_process_mesh_list(self, inter_node_partition, + intra_node_partition): + process_mesh_list = [] + start_row = 0 + start_col = 0 + for m in inter_node_partition: + start_col = 0 + for n in intra_node_partition: + process_mesh = [] + for p in range(m): + start = (start_row + + p) * self._num_devices_per_machine + start_col + tmp = [] + for q in range(n): + tmp.append(start + q) + process_mesh.append(tmp) + process_mesh_list.append(copy.deepcopy(process_mesh)) + start_col += n + start_row += m + return process_mesh_list + + def _generate_dims_mapping_candidates_helper(self, dims_mapping, dims_list, + start, visited, candidates): + if start == len(dims_mapping) or all(visited): + candidates.append(copy.deepcopy(dims_mapping)) + return + + for idx, dim in enumerate(dims_list): + if visited[idx] == False: + dims_mapping[start] = dim + visited[idx] = True + 
self._generate_dims_mapping_candidates_helper( + dims_mapping, dims_list, start + 1, visited, candidates) + visited[idx] = False + dims_mapping[start] = -1 + self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list, + start + 1, visited, + candidates) + + def _generate_dims_mapping_candidates(self, dims_mapping_len, + process_mesh_len): + assert dims_mapping_len >= 1 and process_mesh_len >= 1 + key = (dims_mapping_len, process_mesh_len) + if key in self._cached_dims_mapping_candidates: + return self._cached_dims_mapping_candidates[key] + candidates = [] + dims_mapping = [-1 for i in range(dims_mapping_len)] + dims_list = [i for i in range(process_mesh_len)] + visited = [False for i in range(process_mesh_len)] + self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list, + 0, visited, candidates) + self._cached_dims_mapping_candidates[key] = candidates + return candidates + + def _generate_dist_attr_candidates(self, op_id, dist_op): + # For now, only allow the process meshes have two dimensions + process_mesh_len = 2 + serial_op = dist_op.serial_op + op_dist_attr = dist_op.dist_attr + if serial_op.type in self._special_ops: + return [copy.deepcopy(op_dist_attr)] + key = [] + key.append(serial_op.type) + for input_name in serial_op.input_names: + key.append(input_name) + for input_arg_name in serial_op.input(input_name): + key.append( + len(op_dist_attr.get_input_dims_mapping(input_arg_name))) + for output_name in serial_op.output_names: + key.append(output_name) + for output_arg_name in serial_op.output(output_name): + key.append( + len(op_dist_attr.get_output_dims_mapping(output_arg_name))) + key = tuple(key) + + if key in self._cached_candidates_info: + cached_dist_attr_candidates = [] + cached_input_arg_names = self._cached_candidates_info[key][0] + cached_output_arg_names = self._cached_candidates_info[key][1] + for cached_dist_attr in self._cached_candidates_info[key][2]: + new_op_dist_attr = copy.deepcopy(dist_op.dist_attr) + i = 0 + for input_name in serial_op.input_names: + for input_arg_name in serial_op.input(input_name): + cached_dims_mapping = cached_dist_attr.get_input_dims_mapping( + cached_input_arg_names[i]) + new_op_dist_attr.set_input_dims_mapping( + input_arg_name, cached_dims_mapping) + i += 1 + i = 0 + for output_name in serial_op.output_names: + for output_arg_name in serial_op.output(output_name): + cached_dims_mapping = cached_dist_attr.get_output_dims_mapping( + cached_output_arg_names[i]) + new_op_dist_attr.set_output_dims_mapping( + output_arg_name, cached_dims_mapping) + i += 1 + cached_dist_attr_candidates.append(new_op_dist_attr) + return cached_dist_attr_candidates + + # cached_candidates_info = [] + input_arg_names = [] + for input_name in serial_op.input_names: + for input_arg_name in serial_op.input(input_name): + input_arg_names.append(input_arg_name) + self._cached_candidates_info[key].append(input_arg_names) + # cached_candidates_info.append(input_arg_names) + output_arg_names = [] + for output_name in serial_op.output_names: + for output_arg_name in serial_op.output(output_name): + output_arg_names.append(output_arg_name) + self._cached_candidates_info[key].append(output_arg_names) + # cached_candidates_info.append(output_arg_names) + + new_op_dist_attr = copy.deepcopy(dist_op.dist_attr) + # Find valid dims_mapping candidates for inputs + input_names = [] + dims_mapping_generated = [] + inputs_dist_attrs = op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in inputs_dist_attrs.items(): + original_dims_mapping = 
tensor_dist_attr.dims_mapping + dims_mapping_len = len(original_dims_mapping) + input_names.append(tensor_name) + if dims_mapping_len < 1: + dims_mapping_generated.append( + [copy.deepcopy(original_dims_mapping)]) + else: + dims_mapping_generated.append( + self._generate_dims_mapping_candidates( + dims_mapping_len, process_mesh_len)) + input_dims_mapping_candidates = [] + for dims_mapping_list in itertools.product(*dims_mapping_generated): + dims_mapping_list = list(dims_mapping_list) + assert len(dims_mapping_list) == len(input_names) + for i, dims_mapping in enumerate(dims_mapping_list): + new_op_dist_attr.set_input_dims_mapping(input_names[i], + dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, + new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, fwd=True) + if dist_op_impls is not None: + input_dims_mapping_candidates.append(dims_mapping_list) + + # Find valid dims_mapping candidates for outputs + output_names = [] + dims_mapping_generated = [] + outputs_dist_attrs = op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in outputs_dist_attrs.items(): + original_dims_mapping = tensor_dist_attr.dims_mapping + dims_mapping_len = len(original_dims_mapping) + output_names.append(tensor_name) + if dims_mapping_len < 1: + dims_mapping_generated.append( + [copy.deepcopy(original_dims_mapping)]) + else: + dims_mapping_generated.append( + self._generate_dims_mapping_candidates( + dims_mapping_len, process_mesh_len)) + output_dims_mapping_candidates = [] + for dims_mapping_list in itertools.product(*dims_mapping_generated): + dims_mapping_list = list(dims_mapping_list) + assert len(dims_mapping_list) == len(output_names) + for i, dims_mapping in enumerate(dims_mapping_list): + new_op_dist_attr.set_output_dims_mapping( + output_names[i], dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, + new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, fwd=False) + if dist_op_impls is not None: + output_dims_mapping_candidates.append(dims_mapping_list) + + if not input_dims_mapping_candidates and output_dims_mapping_candidates: + inout_dims_mapping_generated = [[[[-2]]], + output_dims_mapping_candidates] + elif input_dims_mapping_candidates and not output_dims_mapping_candidates: + inout_dims_mapping_generated = [ + input_dims_mapping_candidates, [[[-2]]] + ] + elif not input_dims_mapping_candidates and not output_dims_mapping_candidates: + inout_dims_mapping_generated = [[[[-2]]], [[[-2]]]] + else: + inout_dims_mapping_generated = [ + input_dims_mapping_candidates, output_dims_mapping_candidates + ] + # Find valid dims_mapping generated for both inputs and outputs + cached_dist_attr_candidates = [] + for inout_dims_mapping_list in itertools.product( + *inout_dims_mapping_generated): + assert len(inout_dims_mapping_list) == 2 + if input_dims_mapping_candidates: + assert len(inout_dims_mapping_list[0]) == len(input_names) + if output_dims_mapping_candidates: + assert len(inout_dims_mapping_list[1]) == len(output_names) + # set the dims_mappings for inputs + for i, dims_mapping in enumerate(inout_dims_mapping_list[0]): + if dims_mapping != [-2]: + new_op_dist_attr.set_input_dims_mapping( + input_names[i], dims_mapping) + # set the dims_mappings for outputs + for i, dims_mapping in enumerate(inout_dims_mapping_list[1]): + if dims_mapping != [-2]: + new_op_dist_attr.set_output_dims_mapping( + output_names[i], dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, 
+ new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, partial=False) + if dist_op_impls is None: + continue + for dist_op_impl in dist_op_impls: + new_op_dist_attr.impl_type = dist_op_impl.type + new_op_dist_attr.impl_idx = dist_op_impl.idx + cached_dist_attr_candidates.append( + copy.deepcopy(new_op_dist_attr)) + self._cached_candidates_info[key].append(cached_dist_attr_candidates) + return self._cached_candidates_info[key][2] + + def construct_space(self): + inter_node_partitions, intra_node_partitions = self._partition_devices( + self._num_machines, self._num_devices_per_machine) + self._space.choice("inter_node_partitions", + inter_node_partitions, + default=inter_node_partitions[0]) + self._space.choice("intra_node_partitions", + intra_node_partitions, + default=intra_node_partitions[0]) + + dist_ops = self._dist_context._dist_ops_for_program + for op_id, dist_op in dist_ops.items(): + op_type = dist_op.serial_op.type + if self._include_op_types: + if op_type in self._include_op_types: + self._concerned_dist_ops[op_id] = dist_op + else: + self._concerned_dist_ops[op_id] = dist_op + + for op_id, dist_op in self._concerned_dist_ops.items(): + op_type = dist_op.serial_op.type + if op_type in self._exclude_op_types: + del self._concerned_dist_ops[op_id] + + print("Number of the concered dist ops", + len(self._concerned_dist_ops), + flush=True) + search_space = 1 + for op_id, dist_op in self._concerned_dist_ops.items(): + op_dist_attr_candidates = self._generate_dist_attr_candidates( + op_id, dist_op) + search_space *= len(op_dist_attr_candidates) + self._space.choice(str(op_id), + op_dist_attr_candidates, + default=op_dist_attr_candidates[0]) + + def _compute_values_hash(self, values): + keys = sorted(values.keys()) + s = "".join(str(k) + "=" + str(values[k]) for k in keys) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] + + def _random_values(self): + space = TunableSpace() + collisions = 0 + while True: + for v in self._space.variables.values(): + space._register(v) + space.values[v.name] = v.random(self._seed_state) + self._seed_state += 1 + values = space.values + values_hash = self._compute_values_hash(values) + if values_hash in self._tried_values: + collisions += 1 + if collisions > self._max_collisions: + return None + continue + self._tried_values.add(values_hash) + break + return values + + def _populate_space(self): + values = self._random_values() + if values is None: + return {"status": TrialStatus.STOPPED, "values": None} + return {"status": TrialStatus.RUNNING, "values": values} + + def _create_trial(self): + trial_id = "{{:0{}d}}".format(len(str(self._max_trials))) + trial_id = trial_id.format(self._num_trials) + + if self._max_trials and self._num_trials >= self._max_trials: + status = TrialStatus.STOPPED + values = None + else: + results = self._populate_space() + status = results["status"] + values = results["values"] + + space = TunableSpace() + space.variables = self._space.variables + space.values = values + trial = Trial(tunable_space=space, trial_id=trial_id, status=status) + self._num_trials += 1 + return trial + + def _generate_pipeline_starts(self, process_mesh_list): + total_ops = len(self._dist_context._dist_ops_for_program) + total_stages = len(process_mesh_list) + ops_per_stage = total_ops // total_stages + if ops_per_stage == 0: + return None + # Compute the initial pipeline starts + pipeline_starts = [] + start = 0 + pipeline_starts.append(0) + # The pipeline_starts have total_stages+1 items, and + # 
have at least 2 items. + for _ in process_mesh_list: + start += ops_per_stage + pipeline_starts.append(start) + pipeline_starts[-1] = total_ops + # Adjust the pipeline starts by random selection + directions = [] + sizes = [] + half_ops_per_stage = ops_per_stage // 2 + if half_ops_per_stage > 0 and total_stages > 1: + new_pipeline_starts = [] + # Don't change the first start + new_pipeline_starts.append(0) + # Consider the starts except the first and the last one + for _ in pipeline_starts[1:-1]: + directions.append(Boolean("direction")) + sizes.append( + IntRange("size", + start=0, + stop=half_ops_per_stage, + endpoint=True)) + for i, start in enumerate(pipeline_starts[1:-1]): + direction = directions[i].random(self._seed) + size = sizes[i].random(self._seed) + if direction: + # Subtract 1 from size to avoid overlapping of new starts + new_start = start - (size - 1) + else: + new_start = start + size + new_pipeline_starts.append(new_start) + # Don't change the last start + new_pipeline_starts.append(pipeline_starts[-1]) + # Validate the new starts + print("Adjusted pipeline starts", + new_pipeline_starts, + half_ops_per_stage, + pipeline_starts, + flush=True) + for i, new_start in enumerate(new_pipeline_starts[1:]): + assert new_start > new_pipeline_starts[i] + return new_pipeline_starts + else: + print("Non-adjusted pipeline starts", + pipeline_starts, + half_ops_per_stage, + flush=True) + return pipeline_starts + + def _apply_pipeline_partition(self, process_mesh_list): + op_id_to_process_mesh = {} + total_ops = len(self._dist_context._dist_ops_for_program) + total_stages = len(process_mesh_list) + ops_per_stage = total_ops // total_stages + if ops_per_stage == 0: + return None + pipeline_starts = self._generate_pipeline_starts(process_mesh_list) + start_idx = 1 + sorted_op_ids = sorted(self._dist_context._dist_ops_for_program.keys()) + for idx, op_id in enumerate(sorted_op_ids): + if idx < pipeline_starts[start_idx]: + op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1] + else: + start_idx += 1 + op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1] + return op_id_to_process_mesh + + def _amend_dist_attr(self): + # 1) Reshape the process mesh of [1, x] to [x] or [x, 1] to [x], + # and amend the corresponding dims_mapping. + # 2) Set the dim_mapping to -1 when the shape cannot be divided + # by the corresponding processes.
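A small standalone illustration (assumed semantics, hypothetical helper name) of rule 2): a tensor dimension may stay sharded only if its size divides evenly over the mesh dimension it is mapped to; otherwise it falls back to replication (-1). Mapping onto a mesh dimension of size 1 is also reset, as the loop below does.

def amend_dims_mapping(dims_mapping, tensor_shape, process_shape):
    amended = []
    for size, dim_mapping in zip(tensor_shape, dims_mapping):
        if dim_mapping != -1 and (size % process_shape[dim_mapping] != 0
                                  or process_shape[dim_mapping] == 1):
            amended.append(-1)   # not evenly divisible (or trivial mesh dim): replicate
        else:
            amended.append(dim_mapping)
    return amended

# A [6, 4] tensor on a [2, 3] mesh: dim 0 stays sharded (6 % 2 == 0),
# dim 1 is reset because 4 % 3 != 0.
print(amend_dims_mapping([0, 1], [6, 4], [2, 3]))  # [0, -1]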
+ for dist_op in self._dist_context._dist_ops_for_program.values(): + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + if process_mesh is None: + continue + assert process_mesh.ndim == 2 + dim_of_one = None + dim_of_other = None + if process_mesh.topology[0] == 1: + dim_of_one = 0 + dim_of_other = 1 + elif process_mesh.topology[1] == 1: + dim_of_one = 1 + dim_of_other = 0 + + if dim_of_one is not None: + dist_attr.process_mesh = ProcessMesh(process_mesh.processes) + self._dist_context.add_process_mesh(dist_attr.process_mesh) + + for arg_name in dist_attr.inputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + for dim_mapping in dims_mapping: + if dim_mapping == dim_of_one: + new_dims_mapping.append(-1) + elif dim_mapping == dim_of_other: + new_dims_mapping.append(0) + else: + new_dims_mapping.append(dim_mapping) + dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping) + + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + # dynamic_dims = dist_attr.get_input_dynamic_dims(arg_name) + process_mesh = dist_attr.process_mesh + process_shape = process_mesh.topology + tensor = dist_op.get_serial_input(arg_name) + if dims_mapping: + tensor_shape = tensor.shape + else: + continue + for i, dim_mapping in enumerate(dims_mapping): + # if dim_mapping != -1 \ + # and (tensor_shape[i] % process_shape[dim_mapping] != 0 \ + # or dynamic_dims[i] == 1): + if dim_mapping != -1 \ + and (tensor_shape[i] % process_shape[dim_mapping] != 0): + dims_mapping[i] = -1 + # it is a fix-bug + if dim_mapping != -1 \ + and process_shape[dim_mapping] == 1: + dims_mapping[i] = -1 + + for arg_name in dist_attr.outputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + for dim_mapping in dims_mapping: + if dim_mapping == dim_of_one: + new_dims_mapping.append(-1) + elif dim_mapping == dim_of_other: + new_dims_mapping.append(0) + else: + new_dims_mapping.append(dim_mapping) + dist_attr.set_output_dims_mapping(arg_name, new_dims_mapping) + + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + # dynamic_dims = dist_attr.get_output_dynamic_dims(arg_name) + process_mesh = dist_attr.process_mesh + process_shape = process_mesh.topology + + tensor = dist_op.get_serial_output(arg_name) + if dims_mapping: + tensor_shape = tensor.shape + else: + continue + for i, dim_mapping in enumerate(dims_mapping): + if dim_mapping != -1 \ + and (tensor_shape[i] % process_shape[dim_mapping] != 0): + dims_mapping[i] = -1 + # it is a fix-bug + if dim_mapping != -1 \ + and process_shape[dim_mapping] == 1: + dims_mapping[i] = -1 + dist_op_impls = find_compatible_distributed_operator_impls( + dist_op, partial=False) + serial_op_type = dist_op.serial_op.type + + if dist_op_impls is not None and ( + serial_op_type != "fused_softmax_mask_upper_triangle" + or self._check_fused_softmax_mask_upper_triangle(dist_op)): + dist_op.dist_attr.impl_type = dist_op_impls[0].type + dist_op.dist_attr.impl_idx = dist_op_impls[0].idx + else: + # Use the default dist op impl + for arg_name in dist_attr.inputs_dist_attrs.keys(): + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + for i, _ in enumerate(dims_mapping): + dims_mapping[i] = -1 + for arg_name in dist_attr.outputs_dist_attrs.keys(): + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + for i, _ in enumerate(dims_mapping): + dims_mapping[i] = -1 + dist_op.dist_attr.impl_type = "default" + dist_op.dist_attr.impl_idx = 0 + + def 
_check_fused_softmax_mask_upper_triangle(self, dist_op): + """The last_but_one dim shoule be equal to last dim.""" + input_name = dist_op.serial_op.input_arg_names[0] + input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( + input_name) + topology = dist_op.dist_attr.process_mesh.topology + input_tensor = dist_op.get_serial_input(input_name) + last_but_one_dim = input_tensor.shape[-2] // topology[ + input_dims_mapping[-2]] if input_dims_mapping[ + -2] != -1 else input_tensor.shape[-2] + last_dim = input_tensor.shape[-1] // topology[input_dims_mapping[ + -1]] if input_dims_mapping[-1] != -1 else input_tensor.shape[-1] + if last_but_one_dim == last_dim: + return True + return False + + def _eval_trial(self, trial): + if self._num_trials == 0: + num_prev_trials = 0 + else: + num_prev_trials = self._num_trials - 1 + + results = None + + start_time = time.time() + + inter_node_partition = trial.space.values["inter_node_partitions"] + intra_node_partition = trial.space.values["intra_node_partitions"] + process_mesh_list = self._generate_process_mesh_list( + inter_node_partition, intra_node_partition) + print("\tprocess_mesh list", process_mesh_list, flush=True) + op_id_to_process_mesh = self._apply_pipeline_partition( + process_mesh_list) + if op_id_to_process_mesh is None: + print("Operators are less than pipeline stages", flush=True) + return results + + op_id_to_dist_attr = {} + for name, value in trial.space.values.items(): + if name != "inter_node_partitions" \ + and name !="intra_node_partitions": + op_id_to_dist_attr[int(name)] = value + + end_time = time.time() + cur_sample_time = end_time - start_time + self._sample_time = (num_prev_trials * self._sample_time + + cur_sample_time) / self._num_trials + print("\tsample_time", + num_prev_trials, + self._num_trials, + self._sample_time, + cur_sample_time, + flush=True) + + assert len(op_id_to_process_mesh) == len(op_id_to_dist_attr) + + start_time = time.time() + for op_id, process_mesh in op_id_to_process_mesh.items(): + dist_op = self._dist_context._dist_ops_for_program[op_id] + dist_op.dist_attr = copy.deepcopy(op_id_to_dist_attr[op_id]) + assert dist_op.dist_attr.impl_type == op_id_to_dist_attr[ + op_id].impl_type + assert dist_op.dist_attr.impl_idx == op_id_to_dist_attr[ + op_id].impl_idx + dist_op.dist_attr.process_mesh = process_mesh + self._amend_dist_attr() + + self._completer._complete_tensor_dist_attr_by_op() + + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + + end_time = time.time() + cur_complete_time = end_time - start_time + self._complete_time = (num_prev_trials * self._complete_time + + cur_complete_time) / self._num_trials + print("\tcomplete_time", + num_prev_trials, + self._num_trials, + self._complete_time, + cur_complete_time, + flush=True) + + start_time = time.time() + estimate_time = self._estimate_trial() + end_time = time.time() + cur_estimate_time = end_time - start_time + self._estimate_time = (num_prev_trials * self._estimate_time + + cur_estimate_time) / self._num_trials + print("\testimate_time", + num_prev_trials, + self._num_trials, + self._estimate_time, + cur_estimate_time, + estimate_time, + flush=True) + + results = {"estimate_time": estimate_time} + return results + + def _update_trail(self, trial, metrics, step=0): + for metric_name, metric_value in metrics.items(): + trial.recorder.update(metric_name, metric_value, step=step) + return trial.status + + def _estimate_trial(self): + assert self._cluster is not None + if self._mode == "eval": + 
self._estimator = CostEstimator( + self._dist_context.serial_main_program, + self._cluster, + loop_count=self._loop_count) + elif self._mode == "predict": + self._estimator = CostEstimator( + self._dist_context.serial_main_program, + self._cluster, + loop_count=self._loop_count) + elif self._mode == "train": + # get serial main program with backward + serial_main_program = self._dist_context.serial_main_program + serial_startup_program = self._dist_context.serial_startup_program + serial_optimizer = self._dist_context.serial_optimizer + + # Generate backward + serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + params_grads = self._parallelizer._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + + # Generate optimizer + optimizer_ops = self._parallelizer._generate_optimizer( + serial_main_program, serial_startup_program, serial_optimizer, + params_grads) + self._estimator = CostEstimator(serial_main_program, + self._cluster, + loop_count=self._loop_count) + + max_memory = self._estimator._estimate_max_memory_by_dist_op( + self._dist_context) + print("\tmax_memory", "{:,}".format(max_memory), flush=True) + # The max memory must be less than 80% of 32GB (hard-coded) + if max_memory > 32 * 0.8 * 1024 * 1024 * 1024: + return math.inf + else: + global_cost = self._estimator.estimate(self._dist_context) + return global_cost.time + + def _store_init_parallel_strategy(self): + # If there is no annotation information, use dp as the initial parallel strategy. + # TODO: we need a better way to set up the initial parallel strategy. + if not self._dist_context.has_annotation \ + or not self._dist_context.process_meshes: + ranks = self._num_machines * self._num_devices_per_machine + tensor_node = self._dist_context._serial_ordered_tensor_nodes[0] + tensor_node_id = _node_id(tensor_node) + tensor = self._dist_context._dist_tensors_for_graph[ + tensor_node_id].serial_tensor + tensor_dist_attr = self._dist_context._dist_tensors_for_graph[ + tensor_node_id].dist_attr + tensor_dist_attr.process_mesh = ProcessMesh(list(range(ranks))) + self._dist_context._process_meshes.append( + tensor_dist_attr.process_mesh) + tensor_dist_attr.dims_mapping = [0] + [ + -1 for _ in range(len(tensor.shape) - 1) + ] + tensor_dist_attr.mark_annotated("process_mesh") + tensor_dist_attr.mark_annotated("dims_mapping") + print("Use dp as the init parallel strategy!", flush=True) + + # Do the sharding propagation + self._completer.complete_forward_annotation() + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + + # Back up the initial parallel strategy + self._init_parallel_strategy[0] = copy.deepcopy( + self._dist_context._dist_tensors_for_program) + self._init_parallel_strategy[1] = copy.deepcopy( + self._dist_context._dist_ops_for_program) + self._init_parallel_strategy[2] = copy.deepcopy( + self._dist_context.process_meshes) + + # Initialize the best parallel strategy to the initial one + self._best_parallel_strategy[0] = copy.deepcopy( + self._dist_context._dist_tensors_for_program) + self._best_parallel_strategy[1] = copy.deepcopy( + self._dist_context._dist_ops_for_program) + self._best_parallel_strategy[2] = copy.deepcopy( + self._dist_context._process_meshes) + + def _store_best_parallel_strategy(self): + # Swap the best and the current parallel strategy + tmp = [None, None, None] + tmp[0] = self._best_parallel_strategy[0] + tmp[1] = self._best_parallel_strategy[1] + tmp[2] = self._best_parallel_strategy[2] +
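For reference, a tiny hedged sketch of the data-parallel fallback in _store_init_parallel_strategy above: all ranks form one flat process mesh and only the first (batch) dimension of the tensor is sharded, every other dimension being replicated. The helper name and shapes are made up for illustration.

def dp_init_placement(num_ranks, tensor_shape):
    process_mesh = list(range(num_ranks))                # e.g. [0, 1, 2, 3]
    dims_mapping = [0] + [-1] * (len(tensor_shape) - 1)  # shard dim 0, replicate the rest
    return process_mesh, dims_mapping

print(dp_init_placement(4, [8, 512, 1024]))  # ([0, 1, 2, 3], [0, -1, -1])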
self._best_parallel_strategy[ + 0] = self._dist_context._dist_tensors_for_program + self._best_parallel_strategy[ + 1] = self._dist_context._dist_ops_for_program + self._best_parallel_strategy[2] = self._dist_context._process_meshes + self._dist_context._dist_tensors_for_program = tmp[0] + self._dist_context._dist_ops_for_program = tmp[1] + self._dist_context._process_meshes = tmp[2] + + def tune(self): + global_start_time = time.time() + self._dist_context._backup(serial=True, dist=True) + # This store statement must follow the above backup statement + self._store_init_parallel_strategy() + init_time = self._estimate_trial() # estimate_trial when init + # print_program_with_dist_attr(self._dist_context.serial_main_program, self._dist_context) + # We have to restore the distributed context, because the estimation of one trial needs to + # generate the backward and update parts. Since we will do the tuning process, + # here we only need to reset all distributed information to the default one. + self._dist_context._restore(serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_default") + + best_time = init_time + start_time = time.time() + self.construct_space() + end_time = time.time() + print("construct_space time", + self._num_trials, + end_time - start_time, + flush=True) + create_trial_time = 0.0 + eval_trial_time = 0.0 + self._sample_time = 0.0 + self._complete_time = 0.0 + self._estimate_time = 0.0 + while True: + start_time = time.time() + trial = self._create_trial() + if self._num_trials == 0: + num_prev_trials = 0 + else: + num_prev_trials = self._num_trials - 1 + end_time = time.time() + cur_create_trial_time = end_time - start_time + create_trial_time = (num_prev_trials * create_trial_time + + cur_create_trial_time) / self._num_trials + print("create_trial time", + num_prev_trials, + self._num_trials, + create_trial_time, + cur_create_trial_time, + flush=True) + if trial.status == TrialStatus.STOPPED: + break + # We need to back up the distributed context, because the evaluation of one trial will + # generate the backward and update parts which may change the context. + # However, the distributed information of the context isn't backed up since a new one is used. + self._dist_context._backup(serial=True, dist=False) + + start_time = time.time() + results = self._eval_trial(trial) + end_time = time.time() + cur_eval_trial_time = end_time - start_time + eval_trial_time = (num_prev_trials * eval_trial_time + + cur_eval_trial_time) / self._num_trials + print("eval_trial time", + num_prev_trials, + self._num_trials, + eval_trial_time, + cur_eval_trial_time, + "\n", + flush=True) + + cur_time = results["estimate_time"] + if cur_time < best_time: + self._update_trail(trial, results) + self._store_best_parallel_strategy() + best_time = cur_time + # We need to restore the distributed context and reset the distributed information to the default.
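The tune() loop here reduces to a simple pattern: sample a trial, evaluate its estimated cost inside a backup/restore bracket on the dist_context, and keep the cheapest strategy seen so far. A condensed, hypothetical skeleton (the helpers are stand-ins, not the real API):

import math
import random

def tune(sample_trial, evaluate, max_trials=25):
    best_trial, best_cost = None, math.inf
    for _ in range(max_trials):
        trial = sample_trial()
        if trial is None:  # search space exhausted or too many duplicate samples
            break
        # The real tuner backs up the dist_context before evaluation and restores it
        # afterwards, since evaluation generates backward/optimizer ops that mutate it.
        cost = evaluate(trial)
        if cost < best_cost:
            best_trial, best_cost = trial, cost
    return best_trial, best_cost

print(tune(lambda: random.choice([1, 2, 4, 8]), lambda t: abs(t - 3)))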
+ self._dist_context._restore(serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_default") + # Select the best parallel strategy + self._dist_context._dist_tensors_for_program = self._best_parallel_strategy[ + 0] + self._dist_context._dist_ops_for_program = self._best_parallel_strategy[ + 1] + self._dist_context._process_meshes = self._best_parallel_strategy[2] diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/tuner/profiler.py index a894554c2facd..4b2655028bf7f 100644 --- a/python/paddle/distributed/auto_parallel/tuner/profiler.py +++ b/python/paddle/distributed/auto_parallel/tuner/profiler.py @@ -13,20 +13,17 @@ # limitations under the License. import os -import sys import argparse import traceback import pickle import json import time -import numpy as np -from functools import partial import paddle from paddle.fluid.framework import Program, _current_expected_place -from paddle.fluid.framework import Operator, Parameter -from paddle.distributed.auto_parallel.process_group import clear_all_process_groups, get_all_process_groups, new_process_group -from paddle.distributed.auto_parallel.dist_loader import NonIterableGeneratorLoader +from paddle.fluid.framework import Operator +from paddle.distributed.auto_parallel.process_group import get_all_process_groups, new_process_group +from paddle.distributed.auto_parallel.dist_loader import DistributedDataLoaderFromGenerator from paddle.distributed.collective import _get_global_env paddle.enable_static() @@ -135,13 +132,14 @@ def create_dataloader(main_program, # insert read op at the end of program places = paddle.static.cuda_places() with paddle.static.program_guard(main_program, startup_program): - dataloader = NonIterableGeneratorLoader( - dataset, - feed_list, - places, - dataset.batch_size, - epochs, - steps_per_epoch, + dataloader = DistributedDataLoaderFromGenerator( + dataset=dataset, + feed_list=feed_list, + capacity=70, + places=places, + batch_size=dataset.batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, data_parallel_world_size=dataset.dp_world_size, data_parallel_rank=dataset.dp_rank) diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py index 93ae25c9c4dd1..38dc142468e8a 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -44,10 +44,18 @@ def __init__(self): def variables(self): return self._variables + @variables.setter + def variables(self, variables): + self._variables = variables + @property def values(self): return self._values + @values.setter + def values(self, values): + self._values = values + def get_value(self, name): if name in self.values: return self.values[name] diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py index 424b6b74bb154..31dd07aad374c 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -90,6 +90,7 @@ def __init__(self, name, values, default=None): raise TypeError( "Choice can contain only one type of value, but found values: {} with types: {}." 
.format(str(values), str(types))) + self._is_unknown_type = False if isinstance(values[0], str): values = [str(v) for v in values] @@ -108,9 +109,8 @@ def __init__(self, name, values, default=None): if default is not None: default = bool(default) else: - raise TypeError( - "Choice can only contain str, int, float, or boll, but found: {} " - .format(str(values))) + self._is_unknown_type = True + self._indices = [i for i in range(len(values))] self.values = values if default is not None and default not in values: @@ -129,7 +129,11 @@ def default(self): def random(self, seed=None): rng = np.random.default_rng(seed) - return rng.choice(self.values) + if self._is_unknown_type: + indice = rng.choice(self._indices) + return self.values[indice] + else: + return rng.choice(self.values) def get_state(self): state = super(Choice, self).get_state() diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 8ee11669cbefd..cccff89b3d991 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -27,6 +27,10 @@ from paddle.fluid.io import is_parameter, is_belong_to_optimizer from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES +] + def get_logger(log_level, name="auto_parallel"): logger = logging.getLogger(name) @@ -1583,3 +1587,80 @@ def find_higher_order_backward_op(program): return True return False + + +def get_lr(optimizer): + if isinstance(optimizer, paddle.optimizer.Optimizer): + return optimizer.get_lr() + elif isinstance(optimizer, paddle.fluid.optimizer.Optimizer): + if isinstance(optimizer._learning_rate, float): + return optimizer._learning_rate + else: + return optimizer._learning_rate() + else: + raise TypeError( + "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \ + " or `paddle.fluid.optimizer.Optimizer`, but got {}.".format(type(optimizer)) + ) + + +def initialize_pg_in_full_mode(all_process_groups, cur_rank): + import socket + from ..collective import _get_global_env + + has_recv_by_socket = [] + # This is a magic number + magic_num = 500 + genv = _get_global_env() + cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":") + cur_rank_recv_port = int(cur_rank_port) + magic_num + server_socket = None + # Large enough for recv rank + buff_size = 1024 + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.bind((cur_rank_ip, cur_rank_recv_port)) + # The 10 is an empirical value + server_socket.listen(10) + client_sockets = {} + for process_group in all_process_groups: + if cur_rank not in process_group.ranks: + continue + if len(process_group.ranks) == 2: + index = process_group.ranks.index(cur_rank) + is_send = True if index == 0 else False + if is_send: + recv_rank = process_group.ranks[1] + recv_rank_ip, recv_rank_port = genv.trainer_endpoints[ + recv_rank].split(":") + connect_port = int(recv_rank_port) + magic_num + client_socket = socket.socket(socket.AF_INET, + socket.SOCK_STREAM) + client_socket.connect((recv_rank_ip, connect_port)) + client_socket.send(str(cur_rank).encode('utf-8')) + rank = client_socket.recv(buff_size).decode('utf-8') + rank = int(rank) + if rank != recv_rank: + raise ValueError( + "Please check comm pair, the recv rank should be {} but got {}." 
+ .format(recv_rank, rank)) + else: + print("It is able to instantiate {} as sender now.".format( + process_group.ranks)) + client_socket.close() + else: + send_rank = process_group.ranks[0] + while True: + if send_rank not in has_recv_by_socket: + client_socket, recv_addr = server_socket.accept() + rank = int(client_socket.recv(buff_size).decode()) + client_sockets[rank] = client_socket + has_recv_by_socket.append(rank) + else: + client_sockets[send_rank].send( + str(cur_rank).encode("utf-8")) + client_sockets[send_rank].close() + print("It is able to instantiate {} as recver now.". + format(process_group.ranks)) + break + process_group.instantiate() + server_socket.close() diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 3545783ba177e..9e0aaa6448554 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -517,9 +517,11 @@ def __init__(self): self.set_attr("use_dynamic_loss_scaling", False) self.set_attr("input_data", []) self.set_attr("params_grads", []) + self._loss = None self._loss_scaling = None self._num_good_steps = None self._num_bad_steps = None + self._loss = None def _check_self(self): if self.get_attr("init_loss_scaling") < 0: diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index ec64101c2c7a1..70592e8b38037 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -82,9 +82,11 @@ def _apply_single_impl(self, main_program, startup_program, context): with paddle.static.program_guard(main_program, startup_program): self._analyze_program() - self._prune_grad_scaling() - self._calc_comm_overlap() - grad_group = self._fuse_allreduce() + + if self.is_data_parallel_applied(): + self._prune_grad_scaling() + self._calc_comm_overlap() + grad_group = self._fuse_allreduce() # self.summary(grad_group) @@ -167,6 +169,9 @@ def _analyze_program(self): ) == 0, "Unexception: gradients [{}] is scaled BUT NOT synchronized.".format( not_synchronized_grads) + def is_data_parallel_applied(self): + return len(self._group_to_grad_name_map) > 0 + def _could_be_prune(self): return self.dist_context.gradient_scale and ( diff --git a/python/paddle/distributed/passes/auto_parallel_grad_clip.py b/python/paddle/distributed/passes/auto_parallel_grad_clip.py index f1a0c6e38674a..34c0b7d56a038 100644 --- a/python/paddle/distributed/passes/auto_parallel_grad_clip.py +++ b/python/paddle/distributed/passes/auto_parallel_grad_clip.py @@ -213,7 +213,7 @@ def _check_self(self): if self.get_attr("dist_context") is None: return False dist_context = self.get_attr("dist_context") - if dist_context._lr_optimizer._grad_clip is None: + if dist_context._serial_optimizer._grad_clip is None: return False if self.get_attr("params_grads") is None: return False diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 5840c16fc019c..636b3218c8a0b 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -396,7 +396,7 @@ def _shard_gradient_synchronization(self, main_block): dp_ring_ids = [group.id for group in self.dp_groups] for idx, op in reversed(list(enumerate(main_block.ops))): - 
if is_data_parallel_reduce_op(op): + if _is_param_grad_allreduce_op(op, main_block): input_name = op.input_arg_names[0] base_name = _get_base_name_from_grad_name(input_name) sharding_info = self.varname_to_sharding_info[base_name] @@ -653,6 +653,20 @@ def _get_base_name_from_grad_name(grad_name): return base_name +def _is_param_grad_allreduce_op(op, block): + + if not is_data_parallel_reduce_op(op): + return False + + output_name = op.output_arg_names[0] + base_name = _get_base_name_from_grad_name(output_name) + + if not block.has_var(base_name): + return False + + return block.var(base_name).is_parameter + + def _is_param_grad_sum_op(op, block): if not is_backward_op(op): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index bbccf452742a3..3d34ed4fcdbe7 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -60,6 +60,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_pass_amp MODULES test_pass_amp ENVS ${dist_ENVS}) set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_engine_callbacks MODULES test_engine_callbacks) + set_tests_properties(test_engine_callbacks + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) @@ -78,6 +81,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_dist_embedding MODULES test_dist_embedding ENVS ${dist_ENVS}) py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) + py_test_modules(test_dist_split MODULES test_dist_split ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) @@ -96,4 +100,19 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_strategy MODULES test_strategy) py_test_modules(test_pass_quantization MODULES test_pass_quantization) + py_test_modules(test_dist_shape MODULES test_dist_shape) + py_test_modules(test_dist_assign MODULES test_dist_assign) + py_test_modules(test_conditional_block_reshard MODULES + test_conditional_block_reshard) + + py_test_modules(test_parallel_tuner MODULES test_parallel_tuner ENVS + ${dist_ENVS}) + set_tests_properties(test_parallel_tuner PROPERTIES TIMEOUT 120) + py_test_modules(test_parallel_tuner_full MODULES test_parallel_tuner_full + ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_full PROPERTIES TIMEOUT 120) + py_test_modules(test_parallel_tuner_predict MODULES + test_parallel_tuner_predict ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120) + endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py index 45ca5695af4ca..ea3bdd3208240 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py @@ -88,33 +88,27 @@ def check_results(self, ref_losses, check_losses, rtol=None, atol=None): def test_amp_pass(self): # mp2 training mp_engine = self.get_engine() - mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) - mp_losses = np.array(mp_losses["loss"]) + history = mp_engine.fit(self.dataset, 
3, batch_size=self.batch_size) + mp_losses = np.array(history.history["loss"]) # mp2 amp-o1 training amp_o1_engine = self.get_engine(True, "o1") - amp_o1_losses = amp_o1_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - amp_o1_losses = np.array(amp_o1_losses["loss"]) + history = amp_o1_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o1_losses = np.array(history.history["loss"]) amp_o1_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) # self.check_results(mp_losses, amp_o1_losses) # mp2 amp-o2 training amp_o2_engine = self.get_engine(True, "o2") - amp_o2_losses = amp_o2_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - amp_o2_losses = np.array(amp_o2_losses["loss"]) + history = amp_o2_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o2_losses = np.array(history.history["loss"]) amp_o2_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) # self.check_results(mp_losses, amp_o2_losses) # mp2 amp-o3 training amp_o3_engine = self.get_engine(True, "o3") - amp_o3_losses = amp_o3_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - amp_o3_losses = np.array(amp_o3_losses["loss"]) + history = amp_o3_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o3_losses = np.array(history.history["loss"]) amp_o3_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) # self.check_results(mp_losses, amp_o3_losses) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index bad90667df1c0..1cc68393b6426 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -20,6 +20,8 @@ import numpy as np import subprocess import paddle +import paddle.static as static +import paddle.utils as utils import paddle.nn as nn import paddle.fluid as fluid import paddle.static as static @@ -29,14 +31,17 @@ from paddle.io import Dataset, IterableDataset, DataLoader from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.interface import get_collection, CollectionNames from paddle.optimizer.lr import CosineAnnealingDecay from paddle.fluid.dataloader.collate import default_collate_fn paddle.enable_static() + global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) PP_MESH_0 = auto.ProcessMesh([0]) PP_MESH_1 = auto.ProcessMesh([1]) -batch_size = 1 +epoch_num = 1 +batch_size = 2 batch_num = 10 hidden_size = 1024 sequence_len = 512 @@ -46,6 +51,8 @@ paddle.seed(44) is_fetch = True +is_feed = True +my_feed_vars = [] class MyDataset(Dataset): @@ -63,6 +70,23 @@ def __len__(self): return self.num_samples +def get_random_inputs_and_labels(image_shape, label_shape): + input = np.random.random(size=image_shape).astype('float32') + label = np.random.random(size=label_shape).astype('int64') + return input, label + + +def batch_generator_creator(): + + def __reader__(): + for _ in range(batch_num): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, image_size], [batch_size, 1]) + yield batch_input, batch_label + + return __reader__ + + class MLPLayer(nn.Layer): def __init__(self, @@ -92,16 +116,20 @@ def __init__(self, def forward(self, input): out = auto.shard_op(self.norm, PP_MESH_0)(input) out = self.linear0(out) + if is_feed: + my_feed_vars.append((out, out.shape)) out = F.gelu(out, approximate=True) out = auto.shard_op(self.linear1, PP_MESH_1)(out) out = self.dropout(out) out = self.linear2(out) + if is_feed: + 
my_feed_vars.append((out, out.shape)) if is_fetch: - auto.fetch(out, "out") + auto.fetch(out, "my_fetch", logging=True) return out -def train(fetch): +def train_high_level(fetch): global is_fetch is_fetch = fetch mlp = MLPLayer(hidden_size=hidden_size, @@ -124,10 +152,12 @@ def train(fetch): # train train_dataset = MyDataset(batch_num * batch_size) eval_dataset1 = MyDataset(5 * batch_size) - engine.fit(train_data=train_dataset, - epochs=2, - batch_size=batch_size, - valid_data=eval_dataset1) + + history = engine.fit(train_data=train_dataset, + epochs=2, + batch_size=batch_size, + valid_data=eval_dataset1, + log_freq=1) # eval eval_dataset2 = MyDataset(batch_size) @@ -135,7 +165,7 @@ def train(fetch): # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size=batch_size) + outputs = engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() @@ -145,6 +175,265 @@ def train(fetch): temp_dir.cleanup() +def train_low_level(): + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metrics=None, strategy=strategy) + + feed_dict = {} + for feed_var, shape in my_feed_vars: + feed_dict[feed_var.name] = np.zeros(shape, dtype="float32") + + # Build normal normal dataloader + # train + train_dataset = MyDataset(batch_num * batch_size) + train_dataloader = engine.dataloader(train_dataset, + batch_size=batch_size, + mode="train") + engine.prepare(mode="train") + for data in train_dataloader: + outs = engine.run(data, feed=feed_dict, mode="train") + + # eval + eval_dataset2 = MyDataset(batch_size) + eval_dataloader = engine.dataloader(eval_dataset2, + batch_size=batch_size, + mode="eval") + engine.prepare(mode="eval") + for data in eval_dataloader: + outs = engine.run(data, feed=feed_dict, mode="eval") + + # predict + engine.to_mode("predict") + test_dataset = MyDataset(batch_size) + predict_dataloader = engine.dataloader(test_dataset, batch_size=batch_size) + engine.prepare() + for data in predict_dataloader: + outs = engine.run(data, feed=feed_dict) + + # save + temp_dir = tempfile.TemporaryDirectory() + model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) + temp_dir.cleanup() + + # Build dataloader from generator + # train + train_dataset = MyDataset(batch_num * batch_size) + train_dataloader = engine.dataloader_from_generator(train_dataset, + batch_size=batch_size, + mode="train") + engine.prepare(mode="train") + for data in train_dataloader: + outs = engine.run(data, feed=feed_dict, mode="train") + + # eval + engine.to_mode("eval") + eval_dataset2 = MyDataset(batch_size) + eval_dataloader = engine.dataloader_from_generator(eval_dataset2, + batch_size=batch_size) + engine.prepare() + for data in eval_dataloader: + outs = engine.run(data, feed=feed_dict) + + # predict + test_dataset = MyDataset(batch_size) + predict_dataloader = engine.dataloader_from_generator(test_dataset, + batch_size=batch_size, + mode="predict") + engine.prepare(mode="predict") + for data in predict_dataloader: + outs = engine.run(data, feed=feed_dict, mode="predict") + + # save + temp_dir = tempfile.TemporaryDirectory() 
+ model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) + temp_dir.cleanup() + + +def train_builtin_data_vars(): + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) + + # train + engine.to_mode("train") + + input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input') + label_spec = static.InputSpec([batch_size, 1], 'int64', 'label') + engine.prepare(inputs_spec=[input_spec], labels_spec=[label_spec]) + + with static.program_guard(engine.main_program, engine.startup_program): + feed_list = engine.inputs + engine.labels + print(feed_list) + loader = paddle.io.DataLoader.from_generator(feed_list=feed_list, + capacity=4 * batch_size, + iterable=False) + + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + for _ in range(epoch_num): + loader.start() # call DataLoader.start() before each epoch starts + try: + while True: + engine.run() + except paddle.fluid.core.EOFException: + loader.reset( + ) # call DataLoader.reset() after catching EOFException + + +def train_non_builtin_data_vars(): + main_program = static.Program() + startup_program = static.Program() + with static.program_guard(main_program, + startup_program), utils.unique_name.guard(): + input = static.data(name="input", + shape=[batch_size, image_size], + dtype='float32') + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator(feed_list=[input, label], + capacity=4 * batch_size, + iterable=False) + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(loss=loss_var, + optimizer=optimizer, + metrics=metric, + strategy=strategy) + + # train + engine.to_mode("train") + engine.prepare(inputs=[input], + labels=[label], + main_program=main_program, + startup_program=startup_program) + for _ in range(epoch_num): + loader.start() # call DataLoader.start() before each epoch starts + try: + while True: + engine.run() + except paddle.fluid.core.EOFException: + loader.reset( + ) # call DataLoader.reset() after catching EOFException + + +def get_cost(): + main_program = static.default_main_program() + startup_program = static.default_startup_program() + with static.program_guard(main_program, + startup_program), utils.unique_name.guard(): + input = static.data(name="input", + shape=[batch_size, image_size], + dtype='float32') + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator(feed_list=[input, label], + capacity=4 * batch_size, + iterable=False) + places = 
static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(loss=loss_var, + optimizer=optimizer, + metrics=metric, + strategy=strategy) + engine.cost() + + +def get_cost_by_spec(): + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) + + input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input') + label_spec = static.InputSpec([batch_size, 1], 'int64', 'label') + engine.cost(mode="eval", inputs_spec=[input_spec], labels_spec=[label_spec]) + + if __name__ == "__main__": - train(fetch=True) - train(fetch=False) + train_high_level(fetch=True) + train_high_level(fetch=False) + train_low_level() + train_builtin_data_vars() + train_non_builtin_data_vars() + get_cost() + get_cost_by_spec() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py index 828f82d59ce07..438e17d29f777 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py @@ -84,25 +84,32 @@ def check_results(self, ref_losses, check_losses): def test_gradient_merge_pass(self): # dp2 training dp_engine = self.get_engine() - dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) - dp_losses = np.array(dp_losses["loss"]) + history = dp_engine.fit(self.dataset, + 3, + batch_size=self.batch_size, + log_freq=1) + dp_losses = np.array(history.history["loss"]) # dp2 gradient merge training gm_engine = self.get_engine(True) - gm_losses = gm_engine.fit(self.dataset, 3, batch_size=self.batch_size) - gm_losses = np.array(gm_losses["loss"]) - - avg_loss = 0 - pass_avg_ret_list = [] - for i, pass_ret in enumerate(gm_losses): - if (i + 1) % 4 == 0: - avg_loss += pass_ret - pass_avg_ret_list.append(avg_loss / 4) - avg_loss = 0 - else: - avg_loss += pass_ret - - self.check_results(dp_losses, np.array(pass_avg_ret_list)) + history = gm_engine.fit(self.dataset, + 3, + batch_size=self.batch_size, + log_freq=1) + gm_losses = np.array(history.history["loss"]) + + # avg_loss = 0 + # pass_avg_ret_list = [] + # for i, pass_ret in enumerate(gm_losses): + # if (i + 1) % 4 == 0: + # avg_loss += pass_ret + # pass_avg_ret_list.append(avg_loss / 4) + # avg_loss = 0 + # else: + # avg_loss += pass_ret + + # NOTE: every sample data from dataset is all the same + self.check_results(dp_losses, gm_losses) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py index 
c45f74ea45bb0..1a444353d0399 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py @@ -79,13 +79,13 @@ def check_results(self, ref_losses, check_losses): def test_recompute_pass(self): # mp2 training mp_engine = self.get_engine() - mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) - mp_losses = np.array(mp_losses["loss"]) + history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(history.history["loss"]) # mp2 recompute training rc_engine = self.get_engine(True) - rc_losses = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size) - rc_losses = np.array(rc_losses["loss"]) + history = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc_losses = np.array(history.history["loss"]) self.check_results(mp_losses, rc_losses) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py index 6f5296ce35cdc..356c8ec2e14a7 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py @@ -84,31 +84,31 @@ def check_results(self, ref_losses, check_losses): def test_sharding_pass(self): # dp2 training dp_engine = self.get_engine() - dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) - dp_losses = np.array(dp_losses["loss"]) + history = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + dp_losses = np.array(history.history["loss"]) # sharding2 stage1 training sharding1_engine = self.get_engine(True, 1) - sharding1_losses = sharding1_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - sharding1_losses = np.array(sharding1_losses["loss"]) + history = sharding1_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding1_losses = np.array(history.history["loss"]) self.check_results(dp_losses, sharding1_losses) # sharding2 stage2 training sharding2_engine = self.get_engine(True, 2) - sharding2_losses = sharding2_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - sharding2_losses = np.array(sharding2_losses["loss"]) + history = sharding2_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding2_losses = np.array(history.history["loss"]) self.check_results(dp_losses, sharding2_losses) # sharding2 stage3 training sharding3_engine = self.get_engine(True, 3) - sharding3_losses = sharding3_engine.fit(self.dataset, - 3, - batch_size=self.batch_size) - sharding3_losses = np.array(sharding3_losses["loss"]) + history = sharding3_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding3_losses = np.array(history.history["loss"]) self.check_results(dp_losses, sharding3_losses) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py index 0a3a5993ffdb9..c0f7c8781928b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -82,6 +82,9 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import 
DropoutGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleGradOpCost from test_cluster import cluster_json @@ -417,6 +420,22 @@ def test_comp_cost(self): self.assertTrue(op_cost.flops >= 0) self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) + + op_cost = DropoutGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FusedSoftmaxMaskUpperTriangleOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FusedSoftmaxMaskUpperTriangleGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + # Remove unnecessary files if os.path.exists(cluster_json_path): os.remove(cluster_json_path) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py new file mode 100644 index 0000000000000..86371cbae6436 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import unittest + + import paddle + import paddle.nn as nn + import paddle.nn.functional as F + + from paddle.static import InputSpec + from paddle.distributed.fleet import auto + + + class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=64, + intermediate_size=4 * 64, + initializer_range=0.02): + super(MLPLayer, self).__init__() + self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) + self.linear0 = nn.Linear( + hidden_size, + intermediate_size, + paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)), + bias_attr=None) + self.linear1 = nn.Linear( + intermediate_size, + hidden_size, + paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + + auto.shard_tensor(self.linear0.weight, auto.ProcessMesh([0, 1], "x"), + [None, "x"]) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + + auto.shard_tensor(self.linear1.weight, auto.ProcessMesh([0, 1], "x"), + ["x", None]) + out = self.linear1(out) + + if paddle.mean(out) < 2: + out = self.norm(out) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + else: + out = self.norm(out) + out = self.linear0(out) + out = self.linear1(out) + + return out + + +def loss_fn(predict, label): + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + return loss + + +class TestSubblock(unittest.TestCase): + + def test_subblock(self): + + mlp = MLPLayer() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(model=mlp, loss=loss_fn, strategy=strategy) + + input_spec = InputSpec([4, 64], 'float32', 'input') + label_spec = InputSpec([4, 1], 'float32', 'label') + engine.prepare(inputs_spec=[input_spec], + labels_spec=[label_spec], + mode="predict") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py new file mode 100644 index 0000000000000..b21dd606d8cb7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import unittest + import paddle + from paddle.distributed.fleet import auto + + paddle.enable_static() + + + def make_program(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + y = paddle.static.data(name='y', shape=[4, 4, 8], dtype='float32') + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["d"]), + [None, "d", None]) + + z = paddle.add(x, y) + paddle.assign(x, output=z) + + return main_program, start_program + + + def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + + class TestDistAssign(unittest.TestCase): + + def test_dist_assign(self): + + dist_main_prog, dist_context = parallelizer(make_program, 0) + ops = dist_main_prog.global_block().ops + for op in ops: + if op.type == "assign": + dist_op = dist_context.get_dist_op_for_program(op) + assert dist_op.dist_attr.impl_type == "assign" + assert dist_op.dist_attr.impl_idx == 0 + + x_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + out_var = dist_main_prog.global_block().vars[out_name] + dist_out = dist_context.get_dist_tensor_for_program(out_var) + + x_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( + x_name) + out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping( + out_name) + + assert x_dims_mapping == out_dims_mapping + assert out_dims_mapping == dist_out.dist_attr.dims_mapping + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py index 63621256193a1..d2047332c9a22 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -199,7 +199,7 @@ def test_deepcopy(self): "_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \ "_serial_ordered_op_nodes", "_original_serial_loss", \ "_original_serial_feed_vars", "_original_serial_fetch_vars", \ - "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_lr_optimizer", \ + "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_serial_optimizer", \ "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \ "_pass_context"] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py new file mode 100644 index 0000000000000..5e18b7d90c519 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +def make_program(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + x.stop_gradient = False + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + shape = paddle.shape(x) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistShape(unittest.TestCase): + + def test_dist_shape(self): + + dist_main_prog, dist_context = parallelizer(make_program, 0) + ops = dist_main_prog.global_block().ops + shape_op = ops[0] + dist_op = dist_context.get_dist_op_for_program(shape_op) + assert dist_op.dist_attr.impl_type == "shape" + assert dist_op.dist_attr.impl_idx == 0 + + in_name = shape_op.input_arg_names[0] + out_name = shape_op.output_arg_names[0] + in_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(in_name) + out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(out_name) + + assert in_dims_mapping == [0, -1, -1] + assert out_dims_mapping == [-1] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py new file mode 100644 index 0000000000000..566c57a140dc9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
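An illustrative aside, not part of the patch: the assertions in test_dist_shape rely on the usual dims_mapping convention, where entry i names the mesh dimension that shards tensor axis i and -1 marks a replicated axis. A minimal standalone sketch of that convention (the helper name is made up for illustration):

def local_shape(global_shape, dims_mapping, mesh_topology):
    # Shrink every sharded axis by the size of the mesh dimension it maps to.
    shape = list(global_shape)
    for axis, mesh_dim in enumerate(dims_mapping):
        if mesh_dim != -1:
            degree = mesh_topology[mesh_dim]
            assert shape[axis] % degree == 0, "sharded axis must divide evenly"
            shape[axis] //= degree
    return shape

# x above is [4, 4, 8] with dims_mapping [0, -1, -1] on a 2-process mesh, so
# each rank holds a [2, 4, 8] slice; the 1-D output of the shape op stays
# replicated, which is why its dims_mapping is [-1].
assert local_shape([4, 4, 8], [0, -1, -1], [2]) == [2, 4, 8]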
+ +import unittest +import paddle +from paddle.distributed.fleet import auto + +from paddle.fluid import program_guard +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + + +def make_program_dp2(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 12, 16], dtype='float32') + x.stop_gradient = False + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=1) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistSplit(unittest.TestCase): + + def test_dist_split_dp2(self): + + for rank in range(2): + dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) + ops = dist_main_prog.global_block().ops + op_dist_attr = dist_context.get_op_dist_attr_for_program(ops[0]) + assert op_dist_attr.impl_type == "split" + assert op_dist_attr.impl_idx == 0 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py new file mode 100644 index 0000000000000..9baaee353f715 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py @@ -0,0 +1,173 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import tempfile +import shutil +import time +import random + +import paddle +import paddle.vision.transforms as T + +from paddle.static import InputSpec +from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.callbacks import config_callbacks +from paddle.vision.models import LeNet +from paddle.vision.datasets import MNIST + +paddle.enable_static() + + +class TestCallbacks(unittest.TestCase): + + def setUp(self): + self.save_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.save_dir) + + def run_callback(self): + epochs = 2 + steps = 5 + freq = 2 + eval_steps = 2 + + inputs_spec = [InputSpec([None, 1, 28, 28], 'float32', 'image')] + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(LeNet(), strategy=strategy) + engine.prepare(inputs_spec, mode="predict") + + cbks = config_callbacks(engine=engine, + batch_size=128, + epochs=epochs, + steps=steps, + log_freq=freq, + verbose=self.verbose, + metrics=['loss', 'acc'], + save_dir=self.save_dir) + cbks.on_begin('train') + + logs = {'loss': 50.341673, 'acc': 0.00256} + for epoch in range(epochs): + cbks.on_epoch_begin(epoch) + for step in range(steps): + cbks.on_batch_begin('train', step, logs) + logs['loss'] -= random.random() * 0.1 + logs['acc'] += random.random() * 0.1 + time.sleep(0.005) + cbks.on_batch_end('train', step, logs) + cbks.on_epoch_end(epoch, logs) + + eval_logs = {'eval_loss': 20.341673, 'eval_acc': 0.256} + params = { + 'steps': eval_steps, + 'metrics': ['eval_loss', 'eval_acc'], + } + cbks.on_begin('eval', params) + for step in range(eval_steps): + cbks.on_batch_begin('eval', step, eval_logs) + eval_logs['eval_loss'] -= random.random() * 0.1 + eval_logs['eval_acc'] += random.random() * 0.1 + eval_logs['batch_size'] = 2 + time.sleep(0.005) + cbks.on_batch_end('eval', step, eval_logs) + cbks.on_end('eval', eval_logs) + + test_logs = {} + params = {'steps': eval_steps} + cbks.on_begin('predict', params) + for step in range(eval_steps): + cbks.on_batch_begin('predict', step, test_logs) + test_logs['batch_size'] = 2 + time.sleep(0.005) + cbks.on_batch_end('predict', step, test_logs) + cbks.on_end('predict', test_logs) + + cbks.on_end('train') + + print(engine.history.history) + + def test_callback_verbose_0(self): + self.verbose = 0 + self.run_callback() + + def test_callback_verbose_1(self): + self.verbose = 1 + self.run_callback() + + def test_callback_verbose_2(self): + self.verbose = 2 + self.run_callback() + + def test_callback_verbose_3(self): + self.verbose = 3 + self.run_callback() + + +class TestCallbacksEngine(unittest.TestCase): + + def setUp(self): + self.save_dir = tempfile.mkdtemp() + transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) + self.train_dataset = MNIST(mode='train', transform=transform) + self.test_dataset = MNIST(mode='test', transform=transform) + self.prepare_engine() + + def tearDown(self): + shutil.rmtree(self.save_dir) + + def prepare_engine(self): + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + base_lr = 1e-3 + boundaries = [5, 8] + values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)] + lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries, + values=values, + verbose=False) + optimizer = paddle.optimizer.Adam(learning_rate=lr, + parameters=model.parameters()) + auto.fetch(model.parameters()[0], "param0", logging=True) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + self.engine = auto.Engine(model, loss, optimizer, metrics) + + def 
test_fit_eval(self): + history = self.engine.fit(train_data=self.train_dataset, + valid_data=self.test_dataset, + batch_size=128, + steps_per_epoch=60, + valid_steps=40, + log_freq=20, + save_dir=self.save_dir, + save_freq=1) + print(history.history) + + def test_eval(self): + self.engine.evaluate(valid_data=self.test_dataset, + batch_size=128, + steps=40, + log_freq=10) + + def test_predict(self): + logger_cbks = paddle.callbacks.ProgBarLogger() + self.engine.predict(test_data=self.test_dataset, + batch_size=128, + callbacks=[logger_cbks]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py index 9c116a3288153..3fed759424aae 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py @@ -78,7 +78,7 @@ def init_optimizer(self): def test_lr_scheduler(self): self.init_engine() self.engine.fit(self.dataset, batch_size=self.batch_size) - lr = self.engine._lr_optimizer._learning_rate + lr = self.engine._optimizer._learning_rate assert isinstance(lr, paddle.optimizer.lr.LRScheduler) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py new file mode 100644 index 0000000000000..ab48e2838f9b9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
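A small side note on the callback plumbing exercised above: predict is given a stock paddle.callbacks.ProgBarLogger, and, assuming any paddle.callbacks.Callback subclass is accepted the same way through the callbacks argument, a user-defined callback can be sketched as below (the engine and dataset names in the usage comment are hypothetical):

import paddle


class LossRecorder(paddle.callbacks.Callback):
    """Collect the per-epoch logs the engine reports, similar to History."""

    def __init__(self):
        super(LossRecorder, self).__init__()
        self.epoch_logs = []

    def on_epoch_end(self, epoch, logs=None):
        # `logs` carries the latest metric values for the finished epoch.
        self.epoch_logs.append((epoch, dict(logs or {})))


# Usage sketch:
# recorder = LossRecorder()
# engine.fit(train_dataset, epochs=2, batch_size=64, callbacks=[recorder])
# print(recorder.epoch_logs)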
+ +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = None + # modeling.DPMPPP_MESH_LIST = [ + # ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + # ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + # ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=1) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerTrain(unittest.TestCase): + + def test_tune_with_train(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster) + dist_context.initialize() + 
parallel_tuner = ParallelTuner(dist_context, max_trials=3, mode="train") + parallel_tuner.tune() + parallel_tuner._store_best_parallel_strategy() + flag = True + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py new file mode 100644 index 0000000000000..27833a6a18500 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.planner_v2 import Planner +from paddle.distributed.auto_parallel.strategy import Strategy +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = "dp_mp_pp" + modeling.DPMPPP_MESH_LIST = [ + ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + 
hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=len(modeling.DPMPPP_MESH_LIST)) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerFull(unittest.TestCase): + + def test_tune_with_planner(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + strategy = Strategy() + strategy.auto_mode = "full" + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster, strategy) + dist_context.initialize() + planner = Planner("train", dist_context) + planner._parallel_tuner = ParallelTuner(planner._dist_context, + mode=planner._mode, + max_trials=3) + planner.plan() + flag = True + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py new file mode 100644 index 0000000000000..2d7a2c10579a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = "dp_mp_pp" + modeling.DPMPPP_MESH_LIST = [ + ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=len(modeling.DPMPPP_MESH_LIST)) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerPredict(unittest.TestCase): + + def test_tune_predict(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster) + 
dist_context.initialize() + + parallel_tuner = ParallelTuner(dist_context, + max_trials=3, + mode="predict") + parallel_tuner.tune() + flag = True + + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py index c9419f8c855af..ce38780564b5b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py @@ -101,6 +101,12 @@ def test_construction(self): self.assertEqual(sub_process_mesh4.dim_names, ["d0"]) self.assertEqual(sub_process_mesh4.ndim, 1) + sub_process_mesh5 = sub_process_mesh3[0] + self.assertEqual(sub_process_mesh5.shape, [1]) + self.assertEqual(sub_process_mesh5.process_ids, [1]) + self.assertEqual(sub_process_mesh5.dim_names, ["d0"]) + self.assertEqual(sub_process_mesh5.ndim, 1) + def test_context_manager(self): mesh = np.array([1, 2, 3, 4]) input = static.data(name="input", diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py index e6419b3aafc6e..94d88a69bea35 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -110,7 +110,7 @@ def test_apply_optimzier(self): program_helper.to('train') forward_ops = program_helper.main_program.block(0).ops - self.assertEqual(len(forward_ops), 21) + self.assertEqual(len(forward_ops), 17) # step 2: apply optimzer to generate whole program optimize_ops, _ = program_helper.apply_optimizer(optimizer) @@ -119,7 +119,7 @@ def test_apply_optimzier(self): op for op in program_helper.main_program.block(0).ops if op.type == 'sgd' ] - self.assertEqual(len(all_ops), 41) + self.assertEqual(len(all_ops), 37) self.assertEqual(len(optimize_ops), len(sgd_ops)) program_helper.reset() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py index f0c6a0b7cdf79..58ff36aba09db 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py @@ -136,6 +136,16 @@ def test_state(self): self.assertEqual(new_space.variables["int_range"].step, 1) self.assertEqual(new_space.variables["int_range"].endpoint, False) + def test_expection(self): + space = ts.TunableSpace() + flag = True + try: + val = space.get_value("test") + flag = False + except: + pass + self.assertTrue(flag) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index d31b34cacc923..f0edf8d6e2d83 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -187,6 +187,14 @@ def test_completer(self): train_program) # print_program_with_dist_attr(complete_train_program, dist_context) + def test_completer_by_dist_op(self): + train_program, start_program, dataloader, i, loss = get_program() + dist_context = DistributedContext() + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + complete_train_program = 
completer._complete_tensor_dist_attr_by_op() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index bc8cd91b1a9d1..e065e2077f839 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -305,14 +305,14 @@ def forward(self, auto.shard_tensor(output, PP_MESH_LIST[0], [None for i in range(len(output.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"].extends( - [None for i in range(len(output.shape) - 1)])) + auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(output.shape) - 1)]) if _global_parallel_strategy == "mp_pp": auto.shard_tensor(output, MPPP_MESH_LIST[0], [None for i in range(len(output.shape))]) if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"].extends( - [None for i in range(len(output.shape) - 1)])) + auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(output.shape) - 1)]) for i, mod in enumerate(self.layers): if cache is None: if use_cache: @@ -330,8 +330,8 @@ def forward(self, tgt_mask, use_cache, cache) auto.shard_tensor( - output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends( - [None for i in range(len(output.shape) - 1)])) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, @@ -369,8 +369,8 @@ def forward(self, tgt_mask, use_cache, cache) auto.shard_tensor( - output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends( - [None for i in range(len(output.shape) - 1)])) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": output = auto.shard_op( mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, @@ -385,9 +385,8 @@ def forward(self, output, memory, tgt_mask, use_cache, cache) auto.shard_tensor( - output, DPMPPP_MESH_LIST[mod.mesh_idx], - ["x"].extends( - [None for i in range(len(output.shape) - 1)])) + output, DPMPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) else: output = mod(output, memory, @@ -407,9 +406,9 @@ def forward(self, mod, DPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor(output, DPPP_MESH_LIST[mod.mesh_idx], [ - "x" - ].extends([None for i in range(len(output.shape) - 1)])) + auto.shard_tensor( + output, DPPP_MESH_LIST[mod.mesh_idx], + ["x"] + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( mod, @@ -422,9 +421,9 @@ def forward(self, mod, DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor(output, DPMPPP_MESH_LIST[mod.mesh_idx], [ - "x" - ].extends([None for i in range(len(output.shape) - 1)])) + auto.shard_tensor( + output, DPMPPP_MESH_LIST[mod.mesh_idx], + ["x"] + [None for i in range(len(output.shape) - 1)]) else: output, new_cache = mod(output, memory, @@ -689,11 +688,11 @@ def forward(self, auto.shard_tensor(input_ids, PP_MESH_LIST[0], [None for i in range(len(input_ids.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"].extends( - [None for i in range(len(input_ids.shape) - 1)])) + 
auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"].extends( - [None for i in range(len(input_ids.shape) - 1)])) + auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) encoder_outputs = self.decoder(embedding_output, memory=None, tgt_mask=attention_mask, diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 1ba33a6b52bd7..bdd79b35a499a 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -20,7 +20,7 @@ import numpy as np import paddle -from paddle.distributed import ParallelEnv +from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.utils import try_import from .progressbar import ProgressBar diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 16b3646a4a81a..56bbde53e5c2f 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -50,6 +50,7 @@ import paddle.distributed as dist import paddle.distributed.fleet as fleet from paddle.distributed.fleet.base import role_maker +from paddle.autograd import no_grad from .callbacks import config_callbacks, EarlyStopping from .model_summary import summary @@ -1105,7 +1106,7 @@ def train_batch(self, inputs, labels=None, update=True): self._update_inputs() return loss - @paddle.no_grad() + @no_grad() def eval_batch(self, inputs, labels=None): """ Run one evaluating step on a batch of data. @@ -1157,7 +1158,7 @@ def eval_batch(self, inputs, labels=None): self._update_inputs() return loss - @paddle.no_grad() + @no_grad() def predict_batch(self, inputs): """ Run one predicting step on a batch of data. diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 6928bc75f5f71..26bf28ed10afd 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -19,7 +19,7 @@ import paddle import paddle.nn as nn from paddle.static import InputSpec - +from paddle.autograd import no_grad from collections import OrderedDict __all__ = [] @@ -229,7 +229,7 @@ def _check_input(input_size): return params_info -@paddle.no_grad() +@no_grad() def summary_string(model, input_size=None, dtypes=None, input=None): def _all_is_numper(items):
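A closing illustration, offered as an assumption rather than part of the diff: the no_grad imported from paddle.autograd is applied above as a decorator on eval_batch, predict_batch and summary_string, and it behaves like paddle.no_grad, which can also be used as a context manager. A tiny dygraph sketch:

import paddle
from paddle.autograd import no_grad


@no_grad()
def run_inference(layer, x):
    # No gradient bookkeeping happens inside this call.
    return layer(x)


layer = paddle.nn.Linear(4, 2)
x = paddle.rand([3, 4])
y = run_inference(layer, x)

with no_grad():
    # Same effect when used as a context manager around inference-only code.
    z = layer(x)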