diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index ac008330dff0d1..08d82a0ef16c69 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -106,54 +106,55 @@ class Fleet: Returns: Fleet: A Fleet instance - + Examples: .. code-block:: python :name: code-example1 - # Example1: for collective training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> # Example1: for collective training + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> strategy = fleet.DistributedStrategy() + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - # do distributed training + >>> # do distributed training .. code-block:: python :name: code-example2 - # Example2: for parameter server training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> # Example2: for parameter server training + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer) - if fleet.is_first_worker(): - print("this is first worker") + >>> if fleet.is_first_worker(): + ... print("this is first worker") - print("current node index: {}".format(fleet.worker_index())) - print("total number of worker num: {}".format(fleet.worker_num())) + >>> print("current node index: {}".format(fleet.worker_index())) + >>> print("total number of worker num: {}".format(fleet.worker_num())) - if fleet.is_worker(): - print("this is worker") - print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) + >>> if fleet.is_worker(): + ... print("this is worker") + >>> print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) - print("server num: {}".format(fleet.server_num())) - print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) + >>> print("server num: {}".format(fleet.server_num())) + >>> print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) - if fleet.is_server(): - print("this is server") - fleet.stop_worker() + >>> if fleet.is_server(): + ... print("this is server") + >>> fleet.stop_worker() """ @@ -202,37 +203,37 @@ def init( .. code-block:: python :name: code-example1 - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() .. code-block:: python :name: code-example2 - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) .. 
code-block:: python :name: code-example3 - import paddle.distributed.fleet as fleet - role = fleet.PaddleCloudRoleMaker() - fleet.init(role) + >>> import paddle.distributed.fleet as fleet + >>> role = fleet.PaddleCloudRoleMaker() + >>> fleet.init(role) .. code-block:: python :name: code-example4 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) .. code-block:: python :name: code-example5 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(log_level = "DEBUG") + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(log_level = "DEBUG") """ from paddle.distributed import parallel_helper @@ -693,9 +694,9 @@ def is_first_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_first_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_first_worker() """ return self._role_maker._is_first_worker() @@ -711,9 +712,9 @@ def worker_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_index() """ return self._role_maker._worker_index() @@ -729,9 +730,9 @@ def worker_num(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_num() """ return self._role_maker._worker_num() @@ -760,9 +761,9 @@ def is_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_worker() """ return self._role_maker._is_worker() @@ -781,9 +782,9 @@ def worker_endpoints(self, to_string=False): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_endpoints() """ if to_string: @@ -802,9 +803,9 @@ def server_num(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_num() """ return len(self._role_maker._get_pserver_endpoints()) @@ -819,9 +820,9 @@ def server_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_index() """ return self._role_maker._server_index() @@ -837,9 +838,9 @@ def server_endpoints(self, to_string=False): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_endpoints() """ @@ -860,9 +861,9 @@ def is_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_server() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_server() """ return self._role_maker._is_server() @@ -878,9 +879,9 @@ def barrier_worker(self): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.barrier_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.barrier_worker() """ self._role_maker._barrier("worker") @@ -898,13 +899,13 @@ def init_worker(self, scopes=None): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_worker() + >>> fleet.init_worker() """ self._runtime_handle._init_worker(scopes) @@ -943,13 +944,13 @@ def init_server(self, *args, **kwargs): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ self._runtime_handle._init_server(*args, **kwargs) @@ -968,13 +969,13 @@ def load_model(self, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_model("path", mode=0) + >>> fleet.load_model("path", mode=0) """ self._runtime_handle._load_persistables(path, mode) @@ -993,13 +994,13 @@ def load_one_table(self, table_id, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_one_table(0, "path", mode=0) + >>> fleet.load_one_table(0, "path", mode=0) """ self._runtime_handle._load_one_table(table_id, path, mode) @@ -1018,13 +1019,13 @@ def load_inference_model(self, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.load_inference_model("path", mode=1) + >>> fleet.load_inference_model("path", mode=1) """ self._runtime_handle._load_inference_model(path, mode) @@ -1042,14 +1043,14 @@ def run_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - if fleet.is_server(): - fleet.init_server() + >>> if fleet.is_server(): + ... fleet.init_server() """ self._runtime_handle._run_server() @@ -1067,13 +1068,13 @@ def stop_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.stop_worker() """ self._runtime_handle._stop_worker() @@ -1147,13 +1148,13 @@ def save_inference_model( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...)
+ >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ @@ -1197,17 +1198,17 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): .. code-block:: text - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init() + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - exe = paddle.static.Executor(paddle.CPUPlace()) - fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) + >>> exe = paddle.static.Executor(paddle.CPUPlace()) + >>> fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ self._runtime_handle._save_persistables( @@ -1247,13 +1248,13 @@ def save_one_table(self, table_id, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.save_one_table(0, "path", mode=0) + >>> fleet.save_one_table(0, "path", mode=0) """ self._runtime_handle._save_one_table(table_id, path, mode) @@ -1274,16 +1275,16 @@ def save_dense_params( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - import paddle - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> import paddle + >>> place = paddle.CPUPlace() + >>> exe = paddle.static.Executor(place) - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) - fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) + >>> fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) """ self._runtime_handle._save_dense_params( @@ -1317,12 +1318,13 @@ def distributed_optimizer(self, optimizer, strategy=None): .. code-block:: python - import paddle - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> import paddle + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) + >>> linear = paddle.nn.Linear(10, 10) + >>> strategy = fleet.DistributedStrategy() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) """ self.user_defined_optimizer = optimizer @@ -1380,46 +1382,46 @@ def amp_init( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - paddle.enable_static() - - def run_example_code(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') - conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) - # 1) Use fp16_guard to control the range of fp16 kernels used. 
- with paddle.static.amp.fp16_guard(): - bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") - pool = F.max_pool2d(bn, kernel_size=2, stride=2) - hidden = paddle.static.nn.fc(pool, size=10) - loss = paddle.mean(hidden) - # 2) Create the optimizer and set `multi_precision` to True. - # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) - # 3) These ops in `custom_black_list` will keep in the float32 computation type. - amp_list = paddle.static.amp.CustomOpLists( - custom_black_list=['pool2d']) - # 4) The entry of Paddle AMP. - # Enable pure fp16 training by setting `use_pure_fp16` to True. - optimizer = paddle.static.amp.decorate( - optimizer, - amp_list, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True) - # If you don't use the default_startup_program(), you sholud pass - # your defined `startup_program` into `minimize`. - optimizer.minimize(loss) - exe.run(paddle.static.default_startup_program()) - # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). - # If you want to perform the testing process, you should pass `test_program` into `amp_init`. - optimizer.amp_init(place, scope=paddle.static.global_scope()) - - if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + >>> import paddle + >>> import paddle.nn.functional as F + >>> paddle.enable_static() + + >>> def run_example_code(): + ... place = paddle.CUDAPlace(0) + ... exe = paddle.static.Executor(place) + ... data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') + ... conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) + ... # 1) Use fp16_guard to control the range of fp16 kernels used. + ... with paddle.static.amp.fp16_guard(): + ... bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") + ... pool = F.max_pool2d(bn, kernel_size=2, stride=2) + ... hidden = paddle.static.nn.fc(pool, size=10) + ... loss = paddle.mean(hidden) + ... # 2) Create the optimizer and set `multi_precision` to True. + ... # Setting `multi_precision` to True can help avoid poor accuracy + ... # or slow convergence in some cases. + ... optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) + ... # 3) The ops in `custom_black_list` will be kept in the float32 computation type. + ... amp_list = paddle.static.amp.CustomOpLists( + ... custom_black_list=['pool2d']) + ... # 4) The entry of Paddle AMP. + ... # Enable pure fp16 training by setting `use_pure_fp16` to True. + ... optimizer = paddle.static.amp.decorate( + ... optimizer, + ... amp_list, + ... init_loss_scaling=128.0, + ... use_dynamic_loss_scaling=True, + ... use_pure_fp16=True) + ... # If you don't use the default_startup_program(), you should pass + ... # your defined `startup_program` into `minimize`. + ... optimizer.minimize(loss) + ... exe.run(paddle.static.default_startup_program()) + ... # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`). + ... # If you want to perform the testing process, you should pass `test_program` into `amp_init`. + ... optimizer.amp_init(place, scope=paddle.static.global_scope()) + + >>> if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: + ...
run_example_code() """ amp_optimizer = self._get_amp_optimizer() return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test) @@ -1512,28 +1514,29 @@ def minimize( .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - import paddle.nn.functional as F - - hid_dim = 10 - label_dim = 2 - input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') - fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') - prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') - cost = F.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> import paddle.nn.functional as F + + >>> hid_dim = 10 + >>> label_dim = 2 + >>> input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') + >>> input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') + >>> fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + >>> fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + >>> prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') + >>> cost = F.cross_entropy(input=prediction, label=input_y) + >>> avg_cost = paddle.mean(x=cost) + + >>> fleet.init(is_collective=True) + >>> strategy = fleet.DistributedStrategy() + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> optimizer.minimize(avg_cost) + + >>> # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX """ if not isinstance(loss, list): diff --git a/python/paddle/incubate/nn/functional/fused_rms_norm.py b/python/paddle/incubate/nn/functional/fused_rms_norm.py index 3995cd4a4087d0..99f9c4e72e77d0 100644 --- a/python/paddle/incubate/nn/functional/fused_rms_norm.py +++ b/python/paddle/incubate/nn/functional/fused_rms_norm.py @@ -54,14 +54,15 @@ def fused_rms_norm( Examples: .. 
code-block:: python - # required: gpu - import paddle + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') - paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) - paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - epsilon = 1e-6 - paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) + >>> paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) + >>> paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> epsilon = 1e-6 + >>> paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) """ if in_dynamic_or_pir_mode(): return _C_ops.rms_norm( diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 355b5916b5ddb2..c4cf8abfdb3546 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -56,20 +56,20 @@ def fused_feedforward( This operator only supports running on GPU. The function of the operator is consistent with the following pseudo code: - .. code-block:: python - - residual = x - if pre_layer_norm: - out = layer_norm1(x) - else: - out = x - out = linear2(dropout1(activation(linear1(src)))) - if add_residual: - out = residual + dropout2(out) - else: - out = dropout2(out) - if not pre_layer_norm: - out = layer_norm2(out) + .. code-block:: text + + >>> residual = x + >>> if pre_layer_norm: + ... out = layer_norm1(x) + ... else: + ... out = x + >>> out = linear2(dropout1(activation(linear1(src)))) + >>> if add_residual: + ... out = residual + dropout2(out) + ... else: + ... out = dropout2(out) + >>> if not pre_layer_norm: + ... out = layer_norm2(out) Args: @@ -110,16 +110,17 @@ def fused_feedforward( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - x = paddle.randn(shape=(1, 8, 8), dtype="float32") - linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") - linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") - out = F.fused_feedforward(x, linear1_weight, linear2_weight) - print(out.shape) - # (1, 8, 8) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> x = paddle.randn(shape=(1, 8, 8), dtype="float32") + >>> linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> out = F.fused_feedforward(x, linear1_weight, linear2_weight) + >>> print(out.shape) + [1, 8, 8] """ _verify_dropout_rate(dropout1_rate) _verify_dropout_rate(dropout2_rate) @@ -288,9 +289,9 @@ def fused_bias_dropout_residual_layer_norm( The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: - .. code-block:: python + .. code-block:: text - y = layer_norm(residual + dropout(bias + x)) + >>> y = layer_norm(residual + dropout(bias + x)) Parameters: x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. @@ -323,21 +324,22 @@ def fused_bias_dropout_residual_layer_norm( Examples: .. 
code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # residual: [batch_size, seq_len, embed_dim] - residual = paddle.rand(shape=(2, 4, 128), dtype="float32") - # linear bias: [embed_dim] - bias = paddle.rand(shape=[128], dtype="float32") - # output: [batch_size, seq_len, embed_dim] - output = F.fused_bias_dropout_residual_layer_norm( - x, residual, bias) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # residual: [batch_size, seq_len, embed_dim] + >>> residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # linear bias: [embed_dim] + >>> bias = paddle.rand(shape=[128], dtype="float32") + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_bias_dropout_residual_layer_norm( + ... x, residual, bias) + >>> print(output.shape) + [2, 4, 128] """ seed = None @@ -493,35 +495,35 @@ def fused_multi_head_attention( to information from different representation subspaces. This API only support self_attention. The pseudo code is as follows: - .. code-block:: python - - residual = x - if pre_layer_norm: - out = layer_norm(x) - else: - out = x - # compute q, k, v - out = matmul(out, qkv_weight) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out - q = out[0:1,::] * (head_dim ** -0.5) - k = out[1:2,::] - v = out[2:3,::] - out = matmul(q, k, transpose_y=True) - out = out + attn_mask - out = softmax(out) - out = dropout(out) - out = matmul(out, v) - # combine heads - out = transpose(out, perm=[0, 2, 1, 3]) - # project to output - out = linear(out) - if add_residual: - out = residual + dropout(out) - else: - out = dropout(out) - if not pre_layer_norm: - out = layer_norm(out) + .. code-block:: text + + >>> residual = x + >>> if pre_layer_norm: + ... out = layer_norm(x) + ... else: + ... out = x + >>> # compute q, k, v + >>> out = matmul(out, qkv_weight) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out + >>> q = out[0:1,::] * (head_dim ** -0.5) + >>> k = out[1:2,::] + >>> v = out[2:3,::] + >>> out = matmul(q, k, transpose_y=True) + >>> out = out + attn_mask + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = matmul(out, v) + >>> # combine heads + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> # project to output + >>> out = linear(out) + >>> if add_residual: + ... out = residual + dropout(out) + ... else: + ... out = dropout(out) + >>> if not pre_layer_norm: + ... out = layer_norm(out) Parameters: @@ -581,30 +583,31 @@ def fused_multi_head_attention( .. 
code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # qkv_weight: [3, num_head, head_dim, embed_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - # qkv_bias: [3, num_head, head_dim] - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - # linear_weight: [embed_dim, embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - # linear_bias: [embed_dim] - linear_bias = paddle.rand(shape=[128], dtype="float32") - # self attention mask: [batch_size, num_heads, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_head_attention( - x, qkv_weight, linear_weight, False, - None, None, None, None, 1e-5, qkv_bias, - linear_bias, None, attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # qkv_weight: [3, num_head, head_dim, embed_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> # qkv_bias: [3, num_head, head_dim] + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + >>> # linear_weight: [embed_dim, embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> # linear_bias: [embed_dim] + >>> linear_bias = paddle.rand(shape=[128], dtype="float32") + >>> # self attention mask: [batch_size, num_heads, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_head_attention( + ... x, qkv_weight, linear_weight, False, + ... None, None, None, None, 1e-5, qkv_bias, + ... linear_bias, None, attn_mask) + >>> print(output.shape) + [2, 4, 128] """ seed = None @@ -906,39 +909,39 @@ def fused_multi_transformer( This operator only supports running on GPU. The function of the transformer layer is consistent with the following pseudo code: - .. code-block:: python - - if pre_layer_norm: - out = layer_norm(x) - out = qkv_linear(out) + qkv_bias - else: - out = qkv_linear(x) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out. - q = out[0:1, ::] - k = out[1:2, ::] - v = out[2:3, ::] - out = q * k^t - out = attn_mask + out - out = softmax(out) - out = dropout(out) - out = out * v - out = transpose(out, perm=[0, 2, 1, 3]) - out = linear(out) - if pre_layer_norm: - out = x + dropout(out + bias) - else: - out = layer_norm(x + dropout(out + bias)) - - residual = out; - if pre_layer_norm: - out = ffn_layer_norm(out) - out = ffn1_linear(out) - out = dropout(activation(out + ffn1_bias)) - out = ffn2_linear(out) - out = residual + dropout(out + ffn2_bias) - if not pre_layer_norm: - out = ffn_layer_norm(out) + .. code-block:: text + + >>> if pre_layer_norm: + ... out = layer_norm(x) + ... out = qkv_linear(out) + qkv_bias + ... else: + ... out = qkv_linear(x) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out. 
+ >>> q = out[0:1, ::] + >>> k = out[1:2, ::] + >>> v = out[2:3, ::] + >>> out = q * k^t + >>> out = attn_mask + out + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = out * v + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> out = linear(out) + >>> if pre_layer_norm: + ... out = x + dropout(out + bias) + ... else: + ... out = layer_norm(x + dropout(out + bias)) + + >>> residual = out; + >>> if pre_layer_norm: + ... out = ffn_layer_norm(out) + >>> out = ffn1_linear(out) + >>> out = dropout(activation(out + ffn1_bias)) + >>> out = ffn2_linear(out) + >>> out = residual + dropout(out + ffn2_bias) + >>> if not pre_layer_norm: + ... out = ffn_layer_norm(out) Args: x (Tensor): the input tensor could be 3-D tensor, the input data type could be float16 or float32, the shape is `[batch\_size, sequence\_length, d\_model]`. @@ -996,48 +999,49 @@ def fused_multi_transformer( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - - # ln_scale: [embed_dim], ln_bias: [embed_dim] - ln_scale = paddle.rand(shape=(128,), dtype="float32") - ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - - # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - linear_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] - ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") - ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] - ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") - ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") - - # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] - ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") - ffn2_bias = paddle.rand(shape=(128,), dtype="float32") - - # self attention mask: [batch_size, 1, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_transformer( - x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], - [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], - [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], - attn_mask=attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + + >>> # ln_scale: [embed_dim], ln_bias: [embed_dim] + >>> ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + + >>> # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> linear_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # 
ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] + >>> ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] + >>> ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") + >>> ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") + + >>> # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] + >>> ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") + >>> ffn2_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # self attention mask: [batch_size, 1, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_transformer( + ... x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], + ... [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], + ... [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], + ... attn_mask=attn_mask) + >>> print(output.shape) + [2, 4, 128] """ if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index ded171158fe3dc..09f083da88c741 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -54,6 +54,7 @@ class FusedDropout(paddle.nn.Layer): .. code-block:: python >>> import paddle + >>> paddle.seed(2023) >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") >>> m = paddle.incubate.nn.FusedDropout(p=0.5) @@ -61,15 +62,15 @@ class FusedDropout(paddle.nn.Layer): >>> y_train = m(x) >>> print(y_train) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[2., 0., 6.], - [0., 0., 0.]]) + [[0., 0., 6.], + [0., 0., 0.]]) >>> m.eval() # switch the model to test phase >>> y_test = m(x) >>> print(y_test) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[1., 2., 3.], - [4., 5., 6.]]) + [[1., 2., 3.], + [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None):
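As a CPU-runnable companion to the FusedDropout example converted above, here is a minimal sketch in the same doctest style. It uses only calls that already appear in the updated docstring (`paddle.seed(2023)`, `paddle.incubate.nn.FusedDropout`); the exact zero pattern of `y_train` is an assumption that depends on the Paddle version and device, so treat the printed values in the docstring as illustrative rather than fixed.

    .. code-block:: python

        >>> import paddle
        >>> paddle.seed(2023)  # same seed as the updated docstring; dropout masks may still vary across versions/devices
        >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32")
        >>> m = paddle.incubate.nn.FusedDropout(p=0.5)  # default mode is "upscale_in_train"
        >>> y_train = m(x)  # train phase: dropped entries become 0, kept entries are scaled by 1/(1-p)
        >>> m.eval()        # switch to test phase, where dropout is the identity mapping
        >>> y_test = m(x)   # equal to x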