diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 2d8e1d22b1642..04ab0a8570258 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -136,7 +136,7 @@ def _is_gpu_bfloat16_supported(): def need_keep_fp32(layer, dtype): need_keep_fp32 = False - # Highest prority. Because all the layers except BN will use bfloat16 params in bfoat16 training, + # Highest priority. Because all the layers except BN will use bfloat16 params in bfloat16 training, # here we provide a option to keep fp32 param. if not layer._cast_to_low_precision: need_keep_fp32 = True @@ -238,7 +238,7 @@ def check_models(models): ) if isinstance(model, paddle.DataParallel): raise RuntimeError( - "For distributed AMP training, you should first use paddle.amp.decorate() to decotate origin model, and then call paddle.DataParallel get distributed model." + "For distributed AMP training, you should first use paddle.amp.decorate() to decorate the origin model, and then call paddle.DataParallel to get the distributed model." ) @@ -440,7 +440,7 @@ def master_grad_hook(): # TODO(zhiqiu) set amp related flags automatically in this guard # Currently, if FLAGS_cudnn_batchnorm_spatial_persistent is set True in amp_guard, - # batch_norm can run in fast mode, but batch_norm_grad can not if backward if not executed insise amp_guard. + # batch_norm can run in fast mode, but batch_norm_grad can not if backward is not executed inside amp_guard. # So, users need to set related flags manually. # original_flags = get_flags(AMP_RELATED_FLAGS) @@ -522,7 +522,7 @@ def amp_decorate( level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. - master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. + master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None. diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 1588d2d6b3b53..7a8f214efd4fc 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1055,7 +1055,7 @@ def _get_program_and_executor(self, cached_data): # if enables distributed training with prim mechanism (prim is behind of distributed) # step 1: translate program to pir program. # step 2: decompose PHI ops in pir program into prim ops. - # When decomposing backward ops, the grad_var_to_var in distributed context is needed to finding correpsonding forward op. + # When decomposing backward ops, the grad_var_to_var in distributed context is needed to find the corresponding forward op. 
if ( os.getenv("FLAGS_enable_prim_after_distribute") in ['True', 'true', '1'] @@ -1410,7 +1410,7 @@ def _fetch_data(self, fetch_list, fetch_var_name, scope): @classmethod def _split_optimize_ops_in_fetch_list(cls, fetch_list): """ - Split optimize_ops from fetch_list, which provided to specify program prunning. + Split optimize_ops from fetch_list, which provided to specify program pruning. Args: fetch_list(list): The original fetch_list. Possible types of fetch_list are: @@ -1666,7 +1666,7 @@ def run( and fetch_list Tensor) of this interface remains unchanged during running. The default is False. use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. - If the parameter is True, the program will be pruned accroding to the given feed and fetch_list, + If the parameter is True, the program will be pruned according to the given feed and fetch_list, which means the operators and variables in program that generate :code:`feed` and are not needed to generate :code:`fetch_list` will be pruned. The default is False, which means the program will not pruned and all the operators and variables will be executed during running. diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 6a38ba12a234f..23825357793f5 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -45,7 +45,7 @@ def check_broadcast(block): if "@BroadCast" in var_name: if var_name in broadcast_vars: raise ValueError( - "var_name areadly exist: {}" + "var_name already exist: {}" "the old pos is {}, the new pos is {}".format( var_name, broadcast_vars[var_name]["broadcast_pos"], @@ -1015,7 +1015,7 @@ def is_opt_vars(var): def is_gradient_merge_vars(var): # NOTE(JZ-LIANG): to revise save/load logic in framework instead of write this naive rule - return var.name.endswith("@GradiantMerge") + return var.name.endswith("@GradientMerge") def is_trainable(var): return ( diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 1830773317462..5d2f561ca974d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -987,7 +987,7 @@ def _split_program(self, block): def _prune_main_program(self, block, shard, rings): """ - calculate deps from allredce op to optimize op, + calculate deps from allreduce op to optimize op, remove ops and vars not needed in this worker 1. 
prune regularization (weight decay) @@ -1005,7 +1005,7 @@ def _prune_main_program(self, block, shard, rings): # amp could use global group for sync FP16Utils.prune_fp16(block, shard, self._reduced_grads_to_param, rings) - # clipbyglobalnorm should only use the Model paramllelism group (mp-sharding-pp) + # clipbyglobalnorm should only use the Model parallelism group (mp-sharding-pp) gradientclip_helper = GradientClipHelper(None) gradientclip_helper.prune_gradient_clip(block, shard, rings) @@ -1133,7 +1133,7 @@ def _add_broadcast_allreduce(self, block): self._segments[-1]._end_idx = new_end_idx if self._segments[-1]._allreduce_vars: - shard_allredue_vars = self._shard.filter_grads( + shard_allreduce_vars = self._shard.filter_grads( self._segments[-1]._allreduce_vars ) if ( @@ -1143,20 +1143,20 @@ def _add_broadcast_allreduce(self, block): if ( self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" - and len(shard_allredue_vars) >= 1 + and len(shard_allreduce_vars) >= 1 ): if not self._use_calc_stream: insert_sync_comm_ops( block, self._segments[-1]._end_idx, self.dp_ring_id, - shard_allredue_vars, + shard_allreduce_vars, ) insert_allreduce_ops( block, self._segments[-1]._end_idx, self.dp_ring_id, - shard_allredue_vars, + shard_allreduce_vars, user_defined_strategy=self.user_defined_strategy, use_calc_stream=self._use_calc_stream, ) @@ -1169,7 +1169,7 @@ def _add_broadcast_allreduce(self, block): block, self._startup_program.global_block(), self._segments[-1]._end_idx, - shard_allredue_vars, + shard_allreduce_vars, self._shard, ) if not self._use_calc_stream: @@ -1241,7 +1241,7 @@ def _add_broadcast_allreduce(self, block): ) # step2: add Sync ops - shard_allredue_vars = self._shard.filter_grads(allreduce_vars) + shard_allreduce_vars = self._shard.filter_grads(allreduce_vars) if ( self.gradient_merge_mode != "sharding_gm" @@ -1250,14 +1250,14 @@ def _add_broadcast_allreduce(self, block): if ( self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" - and len(shard_allredue_vars) >= 1 + and len(shard_allreduce_vars) >= 1 ): if not self._use_calc_stream: insert_sync_comm_ops( block, segment._end_idx, self.dp_ring_id, - shard_allredue_vars, + shard_allreduce_vars, ) broad_cast_vars = [x[0] for x in broadcast_vars] @@ -1322,7 +1322,7 @@ def _add_broadcast_allreduce(self, block): block, self._startup_program.global_block(), segment._start_idx, - shard_allredue_vars, + shard_allreduce_vars, self._shard, ) @@ -1343,13 +1343,13 @@ def _add_broadcast_allreduce(self, block): if ( self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" - and len(shard_allredue_vars) >= 1 + and len(shard_allreduce_vars) >= 1 ): insert_allreduce_ops( block, segment._start_idx, self.dp_ring_id, - shard_allredue_vars, + shard_allreduce_vars, user_defined_strategy=self.user_defined_strategy, use_calc_stream=self._use_calc_stream, ) @@ -1562,7 +1562,7 @@ def _build_groups(self): // (self.sharding_degree * self.mp_degree) % self.pp_degree ) - # (NOTE): Already adjust for (outter-pure) dp + # (NOTE): Already adjust for (outer-pure) dp self.pp_group_id = self.global_rank // ( self.mp_degree * self.sharding_degree * self.pp_degree ) @@ -1588,10 +1588,10 @@ def _build_groups(self): self.pp_group_id = -1 self.pp_group_endpoints = [] - # outter-pure-dp group - # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism + # outer-pure-dp group + # NOTE (JZ-LIANG) support outer-pure-dp to scale the throughput in 3D parallelism # e.g. 
mp-sharding-pp-dp - # sharding-hybrid-dp as one senario of outter-pure-dp + # sharding-hybrid-dp as one scenario of outer-pure-dp local_pp_degree = self.pp_degree if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): assert self.pp_degree == 2, ( @@ -1785,7 +1785,7 @@ def create_persistable_gradients_and_insert_merge_ops( assert ( get_grad_device(grad_name, shard) == shard.worker_idx ), f"try to merge gradient not belong to current shard: [{grad_name}]" - persistable_grad_name = grad_name + '@GradiantMerge' + persistable_grad_name = grad_name + '@GradientMerge' assert ( grad_name not in self._grad2merged_grad ), "grad [{}] already in grad2merged_grad, maybe you meet sharing weight case !".format( @@ -2206,7 +2206,7 @@ def _init_communicator( ): nranks = len(endpoints) block = program.global_block() - # init mulit node nccl + # init multi node nccl if nranks > 1: other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/four_directions_p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/four_directions_p2p_communication.py index a4c4a6cf806a3..62f54c09d46c8 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/four_directions_p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/four_directions_p2p_communication.py @@ -816,7 +816,7 @@ def send_backward_recv_forward(self, input_tensor_grad, pp_first_stage): def send_forward_backward_recv_forward_backward( self, output_tensor, input_tensor_grad, recv_prev, recv_next ): - # always have to send dytpe info to downstream + # always have to send dtype info to downstream global _timers if _timers is not None: _timers("send_forward_backward_recv_forward_backward").start() @@ -837,7 +837,7 @@ def send_forward_backward_recv_forward_backward( return input_tensor, output_tensor_grad def send_forward_recv_forward(self, output_tensor, recv_prev): - # always have to send dytpe info to downstream + # always have to send dtype info to downstream global _timers if _timers is not None: _timers("send_forward_recv_forward").start() diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 5f589dd4c6902..4566f89290fc0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -638,7 +638,7 @@ def send_backward_recv_forward(self, input_tensor_grad, pp_first_stage): def send_forward_backward_recv_forward_backward( self, output_tensor, input_tensor_grad, recv_prev, recv_next ): - # always have to send dytpe info to downstream + # always have to send dtype info to downstream global _timers if _timers is not None: _timers("send_forward_backward_recv_forward_backward").start() @@ -661,7 +661,7 @@ def send_forward_backward_recv_forward_backward( return input_tensor, output_tensor_grad def send_forward_recv_forward(self, output_tensor, recv_prev): - # always have to send dytpe info to downstream + # always have to send dtype info to downstream global _timers if _timers is not None: _timers("send_forward_recv_forward").start() diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py index 2ba162c0c10b9..789f0cac73d94 100644 --- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py +++ 
b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py @@ -250,14 +250,14 @@ def backward(ctx, *args): def recompute_hybrid(ctx, function, *args, **kwargs): """ recompute intermediate activations to save the memory in hybrid parallel scene. - # NODTE(shenliang03)The current hybrid parallel recompute has limitations. + # NOTE(shenliang03)The current hybrid parallel recompute has limitations. # It cannot handle the following situations: # 1. The calculation output of recompute, there are tensors that do not require gradients. # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach(). # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor Parameters: - ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the avtivations are splitted + ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the activations are splitted in which group. the key 'offload' (bool, optional, default=False), represents whether to offload to cpu. the key 'partition' (bool, optional, default=False), represents whether to split activations in the mp_group. function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 743ceac3e296c..34572e881e9c5 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -127,7 +127,7 @@ class LocalFS(FS): def ls_dir(self, fs_path): """ - List directorys and files under `fs_path` . + List directories and files under `fs_path` . Args: fs_path(str): The local file path. @@ -377,7 +377,7 @@ def mv(self, src_path, dst_path, overwrite=False, test_exists=False): def list_dirs(self, fs_path): """ - Only list directorys under `fs_path` . + Only list directories under `fs_path` . Args: fs_path(str): The local file path. @@ -516,7 +516,7 @@ def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): @_handle_errors() def list_dirs(self, fs_path): """ - Only list directorys under `fs_path` . + Only list directories under `fs_path` . Args: fs_path(str): The HDFS file path. @@ -550,7 +550,7 @@ def list_dirs(self, fs_path): @_handle_errors() def ls_dir(self, fs_path): """ - List directorys and files under `fs_path` . + List directories and files under `fs_path` . Args: fs_path(str): The HDFS file path. @@ -1226,7 +1226,7 @@ def init(self, fs_name, fs_user, fs_passwd, fs_conf): def list_dirs(self, fs_path): """ - Only list directorys under `fs_path` . + Only list directories under `fs_path` . Args: fs_path(str): The HDFS file path. @@ -1254,7 +1254,7 @@ def list_dirs(self, fs_path): def ls_dir(self, fs_path): """ - List directorys and files under `fs_path` . + List directories and files under `fs_path` . Args: fs_path(str): The HDFS file path. diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 9170754bb78ff..d8142b7081f2b 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -33,7 +33,7 @@ class HybridParallelInferenceHelper: num_pp (int): number of pipeline parallel degree. Default ``1``. micro_batch_size (int): number of micro batch size. Default ``1``. beam_size (int): number of beam search size. Default ``1``. 
- init_comm (bool): wheter if initilize comminication group. Default ``True``. + init_comm (bool): whether to initialize the communication group. Default ``True``. role_maker (RoleMakerBase or subclass): user custom define RoleMakerBase. If ``role_maker==None``, then use PaddleCloudRoleMaker. Default ``None``. diff --git a/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py index ad640a7200d0d..fa728217474d2 100644 --- a/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py +++ b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py @@ -95,7 +95,7 @@ def __init__( def apply(self, src_model_path: str, dst_model_path: str): for i in range(self._src_parallel_config.mp): for j in range(self._src_parallel_config.sharding): - # TODO(liuzhenhai): use multiple processs + # TODO(liuzhenhai): use multiple processes layers = [] # 1、extract layers in the same pp group @@ -190,7 +190,7 @@ def extract_layers(self, dir: str, with_shared: bool): e for e in opt.keys() if e not in ["master_weights", "LR_Scheduler"] ] opt_to_t = self._opt_name_to_tname(tensor_names, opt_names) - # gather tensors belonging to one layer togather + # gather tensors belonging to one layer together layers = OrderedDict() for k, v in params.items(): layer, p = tname_to_layer_and_pname[v.name] @@ -339,7 +339,7 @@ def segment_uniform(): segments[i] = [([layers[0][0]], layers[0][1])] + segments[i] for pp_rank, segs in enumerate(segments): - print(f"segmentment result for pp_rank {pp_rank}:") + print(f"segment result for pp_rank {pp_rank}:") print(50 * "=") for seg in segs: print(f"{seg[0]} => {seg[1]}") @@ -469,56 +469,56 @@ def parse_args(): '--src_mp', type=int, default=2, - help='mp degree of the origin triaing task that dumpped this model', + help='mp degree of the origin training task that dumped this model', ) parser.add_argument( '--src_pp', type=int, default=2, - help='pp degree of the origin triaing task that dumpped this model', + help='pp degree of the origin training task that dumped this model', ) parser.add_argument( '--src_vp', type=int, default=2, - help='vp degree of the origin triaing task that dumpped this model', + help='vp degree of the origin training task that dumped this model', ) parser.add_argument( '--dst_mp', type=int, default=None, - help='mp degree of the origin triaing task that dumpped this model', + help='mp degree of the origin training task that dumped this model', ) parser.add_argument( '--dst_pp', type=int, default=None, - help='pp degree of the expected triaing task that would recover this model', + help='pp degree of the expected training task that would recover this model', ) parser.add_argument( '--dst_vp', type=int, default=2, - help='vp degree of the expected triaing task that would recover this model', + help='vp degree of the expected training task that would recover this model', ) parser.add_argument( '--sharding', type=int, default=1, - help=" sharding degree of both the origin triaing task that dumpped this model and the expected triaing task that would recover this model", + help=" sharding degree of both the origin training task that dumped this model and the expected training task that would recover this model", ) parser.add_argument( '--method', type=str, default="adapt_model", - help='vp degree of the expected triaing task that would recover this model', + help='vp degree of the expected training task that would recover this model', ) parser.add_argument( diff --git 
a/python/paddle/distributed/transpiler/distribute_transpiler.py b/python/paddle/distributed/transpiler/distribute_transpiler.py index 0ad1df56d2b51..2a77fa678ec0a 100644 --- a/python/paddle/distributed/transpiler/distribute_transpiler.py +++ b/python/paddle/distributed/transpiler/distribute_transpiler.py @@ -2265,7 +2265,7 @@ def _create_vars_from_blocklist( NOTE: only grads need to be named for different trainers, use add_trainer_suffix to rename the grad vars. Args: - program (ProgramDesc): ProgramDesc which gradients blong. + program (ProgramDesc): ProgramDesc which gradients belong. block_list (list[(varname, block_id, block_size)]): List of gradient blocks. add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True. Returns: @@ -2329,7 +2329,7 @@ def _create_vars_from_blocklist( dtype=orig_var.dtype, type=orig_var.type, shape=splited_shape, - ) # flattend split var + ) # flatten split var var_mapping[varname].append(var) program.global_block()._sync_with_cpp() return var_mapping diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 866901b840a31..c0ff4dd25b424 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -976,7 +976,7 @@ def load(self, param_state_pairs, optim_state, scaler_state=None): # which would happen when set_state_dict before minimize, the state would be # stored in optimizer._accumulators_holder and loaded lazily. # To contrive this when loading from static-graph saved states, extend - # state dict to include keys named accoring to dygraph naming rules. + # state dict to include keys named according to dygraph naming rules. # TODO: if len(self.model._optimizer._accumulators) > 0 converted_state = dict(optim_state) opt_unq_name = self.model._optimizer._name @@ -1429,7 +1429,7 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): for optimizer states is not necessary if no need to restore the optimizer. NOTE: parameters are retrieved out from the file storing model states - accoring to their structured names. + according to their structured names. For fine-tuning or transfer-learning models where some of the layers have changed, keep parameters needed to restore have same structured names in diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index 023dd8fa24764..986096ad4ccc8 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -263,7 +263,7 @@ def prepare_forward(gate, num_expert, world_size, moe_group): class MoELayer(nn.Layer): """MoE Layer Args: - d_model (int): Model dimention. + d_model (int): Model dimension. experts (nn.LayerList): Expert networks list. gate (dict|NaiveGate|SwitchGate|NaiveGate): diff --git a/python/paddle/incubate/nn/functional/masked_multihead_attention.py b/python/paddle/incubate/nn/functional/masked_multihead_attention.py index f8131e2910461..84541f35f77ee 100644 --- a/python/paddle/incubate/nn/functional/masked_multihead_attention.py +++ b/python/paddle/incubate/nn/functional/masked_multihead_attention.py @@ -39,7 +39,7 @@ def masked_multihead_attention( ): r""" Masked Multi-head attention for text summarization. - This is a fusion operator to compute masked multihead attention in transformer model architecture. + This is a fusion operator to compute masked multi-head attention in transformer model architecture. This operator only supports running on GPU. 
Args: diff --git a/python/paddle/incubate/nn/layer/fused_ec_moe.py b/python/paddle/incubate/nn/layer/fused_ec_moe.py index 1ae117fce4544..a960bad85b2ca 100644 --- a/python/paddle/incubate/nn/layer/fused_ec_moe.py +++ b/python/paddle/incubate/nn/layer/fused_ec_moe.py @@ -40,8 +40,8 @@ class FusedEcMoe(Layer): **bias** (Parameter): the learnable bias of this layer. Shape: - - input: Multi-dimentional tensor with shape :math:`[batch\_size, seq\_len, d\_model]` . - - output: Multi-dimentional tensor with shape :math:`[batch\_size, seq\_len, d\_model]` . + - input: Multi-dimensional tensor with shape :math:`[batch\_size, seq\_len, d\_model]` . + - output: Multi-dimensional tensor with shape :math:`[batch\_size, seq\_len, d\_model]` . Examples: .. code-block:: python diff --git a/python/paddle/incubate/nn/layer/fused_linear.py b/python/paddle/incubate/nn/layer/fused_linear.py index 14b159b9eb896..a28ff647c97af 100644 --- a/python/paddle/incubate/nn/layer/fused_linear.py +++ b/python/paddle/incubate/nn/layer/fused_linear.py @@ -50,8 +50,8 @@ class FusedLinear(Layer): **bias** (Parameter): the learnable bias of this layer. Shape: - - input: Multi-dimentional tensor with shape :math:`[batch\_size, *, in\_features]` . - - output: Multi-dimentional tensor with shape :math:`[batch\_size, *, out\_features]` . + - input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` . + - output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` . Examples: .. code-block:: python diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 1626403e26b5a..b35b87b3e4c31 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -509,7 +509,7 @@ class FusedFeedForward(Layer): epsilon (float, optional): he small value added to the variance to prevent division by zero. Default: 1e-05. activation (str, optional): The activation function. Default relu. - act_dropout_rate (float, optional): The dropout probability after activition. + act_dropout_rate (float, optional): The dropout probability after activation. If None, use the value of `dropout_rate`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into, preprocessing or postprocessing. Default False @@ -747,7 +747,7 @@ class FusedTransformerEncoderLayer(Layer): in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout_rate (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + activation. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. @@ -911,7 +911,7 @@ class FusedTransformer(Layer): Please refer to `Attention is all you need `_ , and see `TransformerEncoder` and `TransformerDecoder` for more details. - Users can configurate the model architecture with corresponding parameters. + Users can configure the model architecture with corresponding parameters. 
Note the usage of `normalize_before` representing where to apply layer normalization (in pre-process or post-precess of multi-head attention or FFN), and some transformer like models are different on this, such as @@ -934,7 +934,7 @@ class FusedTransformer(Layer): in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + activation. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. @@ -946,7 +946,7 @@ class FusedTransformer(Layer): would be used as `weight_attr` for cross attention of `TransformerDecoder`, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention - and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + and cross attention and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` for self attention, cross attention and linear in FFN. Otherwise, the three sub-layers all uses it as `weight_attr` to create parameters. @@ -959,7 +959,7 @@ class FusedTransformer(Layer): would be used as `bias_attr` for cross attention of `TransformerDecoder`, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention - and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + and cross attention and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` for self attention, cross attention and linear in FFN. Otherwise, the three sub-layers all uses it as `bias_attr` to create parameters. diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index bb80b602bcb5b..8a0030bff16df 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -151,7 +151,7 @@ def resnet_unit( class ResNetUnit(Layer): r""" ******Temporary version******. - ResNetUnit is designed for optimize the performence by using cudnnv8 API. + ResNetUnit is designed for optimize the performance by using cudnnv8 API. """ def __init__( diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 2f4b9efe76383..eafb0a82f1c8f 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -163,7 +163,7 @@ class ModelAverage(Optimizer): >>> with model_average.apply(need_restore=False): ... 
evaluate(layer, eval_loader, loss_fn) - >>> print("\nEvaluate With Restored Paramters") + >>> print("\nEvaluate With Restored Parameters") >>> model_average.restore() >>> evaluate(layer, eval_loader, loss_fn) diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py index a88dc419f05e6..02aef51b881e6 100644 --- a/python/paddle/incubate/optimizer/pipeline.py +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -1285,7 +1285,7 @@ def _insert_accumulate_gradients_with_fuse( # To meet the requirement, 128 fp16 or 64 float will be aligned # Think the total shape of the input tensors if [64], # if the dtype is float, then the shape of the fuse var is [64] - # however if the dytpe if fp16, the shape of the fuse var is [128], + # however if the dtype if fp16, the shape of the fuse var is [128], # which will cause the fused vars' shape vary between each other. # To make sure the shape of the fused vars are identical, # we set the dtype of float and fp16 both to 2. diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py index 3a1d3e625938b..9cd31f5f3435a 100644 --- a/python/paddle/incubate/xpu/resnet_block.py +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -327,7 +327,7 @@ def resnet_basic_block( class ResNetBasicBlock(Layer): r""" - ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. + ResNetBasicBlock is designed for optimize the performance of the basic unit of ssd resnet block. If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time. If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this case the shape of output is same with input. diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py index 9f8c4a6f92c66..9a51d2308ae03 100644 --- a/python/paddle/io/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -182,14 +182,14 @@ class DistributedBatchSampler(BatchSampler): or other python object which implemented `__len__` for BatchSampler to get indices of samples. batch_size(int): sample size of each mini-batch. - num_replicas(int, optional): porcess number in distributed training. + num_replicas(int, optional): process number in distributed training. If :attr:`num_replicas` is None, :attr:`num_replicas` will be retrieved from :ref:`api_paddle_distributed_ParallelEnv` . Default None. rank(int, optional): the rank of the current process among :attr:`num_replicas` processes. If :attr:`rank` is None, :attr:`rank` is retrieved from :ref:`api_paddle_distributed_ParallelEnv`. Default None. - shuffle(bool, optional): whther to shuffle indices order before genrating + shuffle(bool, optional): whether to shuffle indices order before generating batch indices. Default False. drop_last(bool, optional): whether drop the last incomplete(less than a mini-batch) batch dataset size. Default False. 
diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index a7d49b3e172a1..46d4539e69c44 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -249,9 +249,9 @@ def mix(x, y): result = (result ^ (result >> XSHIFT)) & MASK32 return result - # init entropys with based_seed and worker_id and calculate pool - entropys = [worker_id, base_seed & MASK32, base_seed >> 32, 0] - pool = [hash(entropy) for entropy in entropys] + # init entropies with based_seed and worker_id and calculate pool + entropies = [worker_id, base_seed & MASK32, base_seed >> 32, 0] + pool = [hash(entropy) for entropy in entropies] # mix all bits together for i in range(len(pool)): @@ -284,7 +284,7 @@ def _worker_loop( num_workers, use_shared_memory, base_seed, - shm_cahce_size=0, + shm_cache_size=0, ): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, @@ -296,7 +296,7 @@ def _worker_loop( # set signal handler core._set_process_signal_handler() - core._set_max_memory_map_allocation_pool_size(shm_cahce_size) + core._set_max_memory_map_allocation_pool_size(shm_cache_size) # set different numpy seed for each worker try: diff --git a/python/paddle/io/multiprocess_utils.py b/python/paddle/io/multiprocess_utils.py index c57b6dae86b5e..b8a746158a49d 100644 --- a/python/paddle/io/multiprocess_utils.py +++ b/python/paddle/io/multiprocess_utils.py @@ -62,7 +62,7 @@ class CleanupFuncRegistrar: @classmethod def register(cls, function, signals=[]): - def _func_exectuor(): + def _func_executor(): if function not in cls._executed_func_set: try: function() @@ -74,11 +74,11 @@ def _func_register(function): raise TypeError("%s is not callable object." % (function)) # check function object whether hash-able {function} if function not in cls._registered_func_set: - atexit.register(_func_exectuor) + atexit.register(_func_executor) cls._registered_func_set.add(function) def _signal_handler(signum=None, frame=None): - _func_exectuor() + _func_executor() if signum is not None: if signum == signal.SIGINT: raise KeyboardInterrupt diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py index 5829b35aa2ee3..fbbbbcb88861f 100644 --- a/python/paddle/io/reader.py +++ b/python/paddle/io/reader.py @@ -218,16 +218,16 @@ class DataLoader: DataLoader provides an iterator which iterates given dataset once by the batch_sampler. - DataLoader supports single-process and multi-prcess data loading, + DataLoader supports single-process and multi-process data loading, multi-process workers will be used to load data asynchronously if :attr:`num_workers` is set as a positive number. DataLoader supports map-style dataset and iterable-style dataset. - For map-style datast(can get a sample from dataset with a given + For map-style dataset(can get a sample from dataset with a given index), please see :code:`paddle.io.Dataset`. - For iterable-style datast(get samples from dataset iteratively, + For iterable-style dataset(get samples from dataset iteratively, like a Python iterator), please see :code:`paddle.io.IterableDataset`. For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` @@ -235,7 +235,7 @@ class DataLoader: Notes: GPU tensor operation is not supported in subprocess currently, please don't use GPU tensor operations in pipeline which will - be performed in subprocess, such as dataset transforms, collte_fn, + be performed in subprocess, such as dataset transforms, collate_fn, etc. Numpy array and CPU tensor operation is supported. 
**Disable automatic batching** @@ -282,7 +282,7 @@ class DataLoader: is not set, a default `paddle.io.BatchSampler` will be used and initialize by :attr:`batch_size`, :attr:`shuffle` and :attr:`drop_last`. Default 1. - shuffle(bool, optional): whther to shuffle indices order before genrate + shuffle(bool, optional): whether to shuffle indices order before generating batch indices, a substitution parameter for :attr:`batch_sampler` see :attr:`batch_size`. Default False. drop_last(bool, optional): whether drop the last incomplete batch dataset size @@ -293,7 +293,7 @@ class DataLoader: 0(same as :attr::`np.stack(..., axis=0)`). Default None num_workers(int, optional): the number of subprocess to load data, 0 for no subprocess used and loading data in main process. Default 0 - use_buffer_reader (bool, optional): whether to use bufferred reader. + use_buffer_reader (bool, optional): whether to use buffered reader. If use_buffer_reader=True, the DataLoader would prefetch batch data asynchronously, so it would speed up data feeding and occupies a little more CPU or GPU memory, i.e., the memory @@ -303,17 +303,17 @@ class DataLoader: use_shared_memory (bool, optional): whether to use shared memory to speed up putting data into inter-process queue, set :attr:`use_shared_memory` as True only when the shared memory space on your machine(e.g. - space of '/dev/shm' on Linux operating sysytem) is large enough. + space of '/dev/shm' on Linux operating system) is large enough. Shared memory will only be enabled in multi-process mode(num_workers > 0). Default True. timeout(int, optional): the timeout value for getting data form output queue of subprocesses. Default 0. worker_init_fn(callable, optional): init function which will be called with - worker id on each subproces starting if not set as None. Default + worker id on each subprocess starting if not set as None. Default None. Returns: - DataLoader: an iterable object for data iterating, each elemnet of the generated data is a Tensor. + DataLoader: an iterable object for data iterating, each element of the generated data is a Tensor. Examples: @@ -429,7 +429,7 @@ def __init__( ): warnings.warn( "DataLoader with multi-process mode is not supported on MacOs and Windows currently." - " Please use signle-process mode with num_workers = 0 instead" + " Please use single-process mode with num_workers = 0 instead" ) num_workers = 0 self.num_workers = num_workers diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 8749955354531..6a3ff80461234 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -295,7 +295,7 @@ def decorated(python_func): def not_to_static(func=None): """ - A Decorator to suppresses the convertion of a function. + A decorator to suppress the conversion of a function. Args: func(callable): The function to decorate. @@ -363,7 +363,7 @@ def __init__(self): # when need to save a prune model, use input_names_after_prune to specify the inputs left after pruning self.input_names_after_prune = None - # in the scene of llm-inference, prunning program can cause unexpectable result, an option to skip prune is necessary + # in the scene of llm-inference, pruning program can cause unexpected results, an option to skip prune is necessary self.skip_prune_program = False @property @@ -493,7 +493,7 @@ def _parse_load_config(configs): def _get_input_var_names(inputs, input_spec, input_names_after_prune): name_none_error = ( "The %s's name is None. 
" - "When using jit.save, please set InputSepc's name in " + "When using jit.save, please set InputSpec's name in " "to_static(input_spec=[]) and jit.save(input_spec=[]) " "and make sure they are consistent." ) @@ -521,7 +521,7 @@ def _get_input_var_names(inputs, input_spec, input_names_after_prune): # no prune return input_var_names else: - # fileter out non-tensor type spec infos. + # filter out non-tensor type spec infos. input_spec = [ spec for spec in input_spec @@ -1154,7 +1154,7 @@ def save(layer, path, input_spec=None, **configs): extra_info_dict['trainable'] = param_or_buffer.trainable extra_var_info[param_or_buffer.name] = extra_info_dict - # 4. build input & output of save_infernece_model + # 4. build input & output of save_inference_model # NOTE(chenweihang): [ Get input variables name ] # There are two cases, whether to prune the inputs or not # - not prune inputs (recommend): @@ -1285,7 +1285,7 @@ def load(path, **configs): .. note:: If you load model saved by ``paddle.static.save_inference_model`` , there will be the following limitations when using it in fine-tuning: - 1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable. + 1. Imperative mode do not support LoDTensor. All original model's feed targets or parameters that depend on LoD are temporarily unavailable. 2. All saved model's feed targets need to be passed into TranslatedLayer's forward function. 3. The variable's ``stop_gradient`` information is lost and can not be recovered. 4. The parameter's ``trainable`` information is lost and can not be recovered. diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 2c248ce6f4837..7b0bcc0d322fa 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -1134,7 +1134,7 @@ def _check_params_all_inited(self, main_program): "\n\tBut we found parameter(%s) was created in the decorated function." "\n" "\n\tRevise suggestion: " - "\n\t\t1. Please ensure all your sublayers are inheritted from nn.Layer." + "\n\t\t1. Please ensure all your sublayers are inherited from nn.Layer." "\n\t\t2. Please use nn.ParameterList and nn.LayerList as container instead of using a native Python container such as List" % name ) diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py index 53d3ff9a718c8..bfe96f68f5f3f 100644 --- a/python/paddle/jit/translated_layer.py +++ b/python/paddle/jit/translated_layer.py @@ -1399,13 +1399,13 @@ def __init__(self, programs, persistable_vars): # NOTE(chenweihang): [ why not use var name directly? ] # When add parameter or buffer to Layer by follow apis, - # the variable name can't contain `.`, beccause which may cause + # the variable name can't contain `.`, because which may cause # AttributeError when access the newly added parameter or buffer # in the form of `self.**.**``, but the EagerParamBase or BarBase # name contains `.` originally, such as `linear_0.w_0`, so here # need to generate new var name for each var self._persistable_var_name_dict = {} - # the TranslatedLayer object holded var names count started from 0 + # the TranslatedLayer object held var names count started from 0 with unique_name.guard(): for name, var in persistable_vars.items(): if isinstance(var, framework.EagerParamBase): @@ -1501,7 +1501,7 @@ def program(self, method_name='forward'): Gets translated program of specified method. 
Args: - - method_name (string): mehtod name corresponding to the program + - method_name (string): method name corresponding to the program to be obtained. Default: 'forward'. Returns: diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f89885a711958..2b43207893b75 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -31,7 +31,7 @@ def celu(x, alpha=1.0, name=None): r""" celu activation. - Apply the following operation to each element of the input Tensor accroding to the `Continuously Differentiable Exponential Linear Units `_. + Apply the following operation to each element of the input Tensor according to the `Continuously Differentiable Exponential Linear Units `_. .. math:: diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 6b741a2b92110..394fc3de6feb9 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -44,8 +44,8 @@ class Identity(Layer): kwargs: any keyword argument (unused) Shape: - - input: Multi-dimentional tensor with shape :math:`[batch\_size, n1, n2, ...]` . - - output: Multi-dimentional tensor with shape :math:`[batch\_size, n1, n2, ...]` . + - input: Multi-dimensional tensor with shape :math:`[batch\_size, n1, n2, ...]` . + - output: Multi-dimensional tensor with shape :math:`[batch\_size, n1, n2, ...]` . Examples: .. code-block:: python @@ -118,8 +118,8 @@ class Linear(Layer): **bias** (Parameter): the learnable bias of this layer. Shape: - - input: Multi-dimentional tensor with shape :math:`[batch\_size, *, in\_features]` . Its data types are float16, float32, float64 ,The default is float32 . - - output: Multi-dimentional tensor with shape :math:`[batch\_size, *, out\_features]` . The data type is the same as the input . + - input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` . Its data types are float16, float32, float64 ,The default is float32 . + - output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` . The data type is the same as the input . Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index fcbda20cff38c..b7c7a1e3b30ec 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -415,7 +415,7 @@ def __init__(self, name_scope=None, dtype="float32"): self._cast_to_low_precision = True self._state_dict_hooks = collections.OrderedDict() - # Records orignal functions after @to_static to support to rollback + # Records original functions after @to_static to support to rollback self._original_funcs = collections.OrderedDict() def train(self): diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 51b4de0f33a3a..147a84e2a14be 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -205,7 +205,7 @@ def __init__( def _prepare_qkv(self, query, key, value, cache=None): r""" - Prapares linear projected queries, keys and values for usage of subsequnt + Prepares linear projected queries, keys and values for usage of subsequent multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. @@ -292,7 +292,7 @@ def compute_kv(self, key, value): def gen_cache(self, key, value=None, type=Cache): """ - Generates cache for `forward` usage in inference accroding to arguments. 
+ Generates cache for `forward` usage in inference according to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. @@ -401,7 +401,7 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): If `cache` is not None, the tuple then includes the new cache having the same type as `cache`, and if it is `StaticCache`, it is same as the input `cache`, if it is `Cache`, the new cache - reserves tensors concatanating raw tensors with intermediate + reserves tensors concatenating raw tensors with intermediate results of current query. """ key = query if key is None else key @@ -467,7 +467,7 @@ class TransformerEncoderLayer(Layer): in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + activation. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. @@ -783,7 +783,7 @@ class TransformerDecoderLayer(Layer): in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + activation. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. @@ -1182,7 +1182,7 @@ class Transformer(Layer): Please refer to `Attention is all you need `_ , and see `TransformerEncoder` and `TransformerDecoder` for more details. - Users can configurate the model architecture with corresponding parameters. + Users can configure the model architecture with corresponding parameters. Note the usage of `normalize_before` representing where to apply layer normalization (in pre-process or post-precess of multi-head attention or FFN), and some transformer like models are different on this, such as @@ -1205,7 +1205,7 @@ class Transformer(Layer): in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + activation. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. @@ -1217,7 +1217,7 @@ class Transformer(Layer): would be used as `weight_attr` for cross attention of `TransformerDecoder`, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention - and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + and cross attention and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` for self attention, cross attention and linear in FFN. Otherwise, the three sub-layers all uses it as `weight_attr` to create parameters. 
@@ -1230,7 +1230,7 @@ class Transformer(Layer): would be used as `bias_attr` for cross attention of `TransformerDecoder`, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention - and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + and cross attention and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` for self attention, cross attention and linear in FFN. Otherwise, the three sub-layers all uses it as `bias_attr` to create parameters. diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index 0c1cb8398a140..cf1cb71237bc6 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -846,10 +846,10 @@ def decorate( # noqa: F811 will be converted to float16/bfloat16, and that have any float32 input will be converted to float32. For the OD level, operators in default white list will compute in float16/bfloat16, and the others will compute in float32. Default is O1. dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. - master_weight(bool, optinal): For level='O2', whether to use multi-precision + master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. - master_grad(bool, optinal): For level='O2', whether to use master_grad + master_grad(bool, optional): For level='O2', whether to use master_grad during weight updating. If master_grad is False, in O2 level optimizer will not use master grad. Default is False. init_loss_scaling(float, optional): The initial loss scaling factor. diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index 06630039ca877..ad2dbf2821ff7 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -62,7 +62,7 @@ def get_low_precision_vartype(dtype): return var_type else: raise TypeError( - "The type of dtype is expected to be string or core.VarDesc.VarType, but recieved {}.".format( + "The type of dtype is expected to be string or core.VarDesc.VarType, but received {}.".format( type(dtype) ) ) @@ -82,7 +82,7 @@ def get_low_precision_dtypestr(dtype): ) else: raise TypeError( - "The type of dtype is expected to be string or core.VarDesc.VarType, but recieved {}.".format( + "The type of dtype is expected to be string or core.VarDesc.VarType, but received {}.".format( type(dtype) ) ) @@ -125,7 +125,7 @@ def _get_unsupported_list(dtype): return _sys_unsupported_list, _sys_all_list -# The three sets listed below are changed dynamiclly. They don't contain all +# The three sets listed below are changed dynamically. They don't contain all # paddle ops currently. 
# The set of ops that support fp16 calculation and are considered numerically- diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 8b245f4609191..92bdb228a2ff9 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -345,7 +345,7 @@ def slice(input, axes, starts, ends): else: raise ValueError( - f"Input axes must be a python list or tuple, but reveived {type(axes)}" + f"Input axes must be a python list or tuple, but received {type(axes)}" ) if in_dynamic_mode(): @@ -633,7 +633,7 @@ def unstack(x, axis=0, num=None): def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): """ - Reset the values of `input` according to the shard it beloning to. + Reset the values of `input` according to the shard it belongs to. Every value in `input` must be a non-negative integer, and the parameter `index_num` represents the integer above the maximum value of `input`. Thus, all values in `input` must be in the range @@ -1944,7 +1944,7 @@ def roll(x, shifts, axis=None, name=None): def stack(x, axis=0, name=None): """ - Stacks all the input tensors ``x`` along ``axis`` dimemsion. + Stacks all the input tensors ``x`` along ``axis`` dimension. All tensors must be of the same shape and same dtype. For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked @@ -3706,7 +3706,7 @@ def scatter(x, index, updates, overwrite=True, name=None): Args: x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64. index (Tensor): The index is a 1-D or 0-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. - updates (Tensor): Update input with updates parameter based on index. When the index is a 1-D tensor, the updates shape should be the same as input, and dim value with dim > 1 should be the same as input. When the index is a 0-D tensor, the updates should be a (N-1)-D tensor, the ith dim of the updates should be queal with the (i+1)th dim of the input. + updates (Tensor): Update input with updates parameter based on index. When the index is a 1-D tensor, the updates shape should be the same as input, and dim value with dim > 1 should be the same as input. When the index is a 0-D tensor, the updates should be a (N-1)-D tensor, the ith dim of the updates should be equal with the (i+1)th dim of the input. overwrite (bool, optional): The mode that updating the output when there are same indices.If True, use the overwrite mode to update the output of the same index,if False, use the accumulate mode to update the output of the same index. Default value is True. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -4085,7 +4085,7 @@ def expand_as(x, y, name=None): Expand the input tensor ``x`` to the same shape as the input tensor ``y``. - Both the number of dimensions of ``x`` and ``y`` must be less than or equal to 6, and the number of dimensions of ``y`` must be greather than or equal to that of ``x``. The dimension to expand must have a value of 0. + Both the number of dimensions of ``x`` and ``y`` must be less than or equal to 6, and the number of dimensions of ``y`` must be greater than or equal to that of ``x``. The dimension to expand must have a value of 0. Args: x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64. 
@@ -4225,7 +4225,7 @@ def expand(x, shape, name=None): if paddle.utils._contain_var(shape): shape = paddle.utils.get_int_tensor_list(shape) else: - TypeError("Shape only supports OpReslut, or list, or tuple.") + TypeError("Shape only supports OpResult, or list, or tuple.") return _C_ops.expand(x, shape) else: if isinstance(shape, Variable): @@ -6153,7 +6153,7 @@ def index_put_(x, indices, value, accumulate=False, name=None): indices (Tuple of Tensor): The tuple of Tensor containing the indices to index. The data type of ``tensor in indices`` must be int32, int64 or bool. value (Tensor): The tensor used to be assigned to x. - accummulate (Bool, optional): Whether the elements in values are added to x. Default: False. + accumulate (Bool, optional): Whether the elements in values are added to x. Default: False. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: