From 93f351089c790b521537dbc4aacee6f1796b9b46 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 7 Aug 2020 03:36:01 +0000 Subject: [PATCH 01/10] add SyncBatchNorm,test=develop --- paddle/fluid/pybind/op_function_generator.cc | 4 + python/paddle/fluid/dygraph/nn.py | 215 +++++++++++++++++- .../parallel_dygraph_sync_batch_norm.py | 91 ++++++++ .../test_parallel_dygraph_sync_batch_norm.py | 40 ++++ python/paddle/nn/__init__.py | 1 + python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/norm.py | 4 +- 7 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 7412eede118d1..50554aaf9a8ed 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -56,6 +56,9 @@ std::map> op_outs_map = { {"batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"sync_batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -75,6 +78,7 @@ std::map> op_passing_outs_map = { {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"momentum", {"ParamOut", "VelocityOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, + {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, {"matmul", {"Out"}}, diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index cc2b746b0c1e9..3205c7921d246 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -35,7 +35,7 @@ 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', 'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', - 'SpectralNorm', 'TreeConv' + 'SpectralNorm', 'TreeConv', 'SyncBatchNorm' ] @@ -3182,3 +3182,216 @@ def forward(self, nodes_vector, edge_set): else: pre_activation = out return self._helper.append_activation(pre_activation, act=self._act) + + +class SyncBatchNorm(layers.Layer): + """ + :alias_main: paddle.nn.SyncBatchNorm + :alias: paddle.nn.SyncBatchNorm,paddle.nn.layer.SyncBatchNorm,paddle.nn.layer.norm.SyncBatchNorm + :old_api: paddle.fluid.dygraph.SyncBatchNorm + + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + For more details, refer to code examples. + It implements the function of the Batch Normalization Layer and can be used + as a normalizer function for conv2d and fully connected operations. + The data is normalized by the mean and variance of the channel based on all mini-batches + of the same process groups. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in train mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of all mini-batches in the same process groups. + Calculated as follows: + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + - :math:`x` : mini-batch data + - :math:`m` : the size of the mini-batch data + + When model in eval mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global or running statistics (moving_mean and moving_variance). + It usually got from the pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + **Note**: + moving mean and moving variance will be calculate whether `track_running_stats` is set to `True` + or `False`, we will fix it in the next time. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + eps(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + scale_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + track_running_stats(bool, optional): Whether to compute global stats, which including running mean and + running variance. Default: True. + + Returns: + None + + Examples: + .. code-block:: python + + from paddle.fluid.dygraph import to_variable + import paddle.nn as nn + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + sync_batch_norm = nn.SyncBatchNorm(10) + hidden1 = sync_batch_norm(x) + """ + + def __init__(self, + num_features, + eps=1e-05, + momentum=0.9, + track_running_stats=True, + scale_attr=None, + bias_attr=None, + name=None): + super(SyncBatchNorm, self).__init__() + self._scale_attr = scale_attr + self._bias_attr = bias_attr + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." + + self._dtype = "float32" + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._scale_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. 
+ + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + self.bias.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + + self._mean = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(0.0), + trainable=False, + do_model_average=False), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(1.0), + trainable=False, + do_model_average=False), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._data_layout = 'NCHW' + self._momentum = momentum + self._epsilon = epsilon + self._track_running_stats = track_running_stats + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + ### train mode: use mini-batch stats, eval mode: use global stats + if self.training: + use_global_stats = False + trainable_statistics = False + else: + use_global_stats = True + trainable_statistics = False + + if in_dygraph_mode(): + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, + "is_test", not self.training, "data_layout", + self._data_layout, "use_mkldnn", False, "fuse_with_relu", + False, "use_global_stats", use_global_stats, + 'trainable_statistics', trainable_statistics) + sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( + input, self.weight, self.bias, self._mean, self._variance, + mean_out, variance_out, *attrs) + + return sync_batch_norm_out + + check_variable_and_dtype(input, 'input', + ['float16', 'float32', 'float64'], 'BatchNorm') + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": not self.training, + "data_layout": self._data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + "trainable_statistics": trainable_statistics, + } + + inputs = { + "X": [input], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + sync_batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + outputs = { + "Y": [sync_batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return sync_batch_norm_out diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py new file mode 100644 index 0000000000000..96be49e277deb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +#from paddle.nn import Conv2D, Pool2D, Linear, SyncBatchNorm +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, SyncBatchNorm +from paddle.fluid.dygraph.base import to_variable + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + + +class TestLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(TestLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._sync_batch_norm = SyncBatchNorm(num_filters) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._sync_batch_norm(y) + + return y + + +class TestSyncBatchNorm(TestParallelDyGraphRunnerBase): + def get_model(self): + model = TestLayer(3, 64, 7) + train_reader = paddle.batch( + paddle.dataset.flowers.test(use_xmap=False), + batch_size=32, + drop_last=True) + opt = fluid.optimizer.Adam( + learning_rate=1e-3, parameter_list=model.parameters()) + return model, train_reader, opt + + def run_one_loop(self, model, opt, data): + batch_size = len(data) + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + img = to_variable(dy_x_data) + img.stop_gradient = False + + out = model(img) + + out = fluid.layers.mean(out) + + return out + + +if __name__ == "__main__": + runtime_main(TestSyncBatchNorm) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py new file mode 100644 index 0000000000000..7d48750b88eb8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase +import paddle.fluid as fluid + +import os +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_sync_batch_norm.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index e074ca66bb1d3..c974fc1414512 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -85,6 +85,7 @@ from .layer.loss import NLLLoss #DEFINE_ALIAS from .layer.loss import BCELoss #DEFINE_ALIAS from .layer.norm import BatchNorm #DEFINE_ALIAS +from .layer.norm import SyncBatchNorm #DEFINE_ALIAS from .layer.norm import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 4963ac360804f..674782d474817 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -61,6 +61,7 @@ from .loss import NLLLoss #DEFINE_ALIAS from .loss import BCELoss #DEFINE_ALIAS from .norm import BatchNorm #DEFINE_ALIAS +from .norm import SyncBatchNorm #DEFINE_ALIAS from .norm import GroupNorm #DEFINE_ALIAS from .norm import LayerNorm #DEFINE_ALIAS from .norm import SpectralNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1beba62c1809f..1d00f9c7b8b02 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -20,7 +20,9 @@ from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS +from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS __all__ = [ - 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm' + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', + 'SyncBatchNorm' ] From 44db6f528605f9139b33458573f4f1cc6eef60f4 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 7 Aug 2020 07:19:21 +0000 Subject: [PATCH 02/10] fix unittest, test=develop --- python/paddle/fluid/dygraph/nn.py | 19 +++++++++++-------- .../fluid/tests/unittests/CMakeLists.txt | 2 ++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 86651eec939d1..94a7375ff5a8e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3191,17 +3191,17 @@ class SyncBatchNorm(layers.Layer): :old_api: paddle.fluid.dygraph.SyncBatchNorm This interface is used to construct a callable object of the ``SyncBatchNorm`` class. - For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used - as a normalizer function for conv2d and fully connected operations. - The data is normalized by the mean and variance of the channel based on all mini-batches - of the same process groups. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. 
+ The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_ for more details. When model in train mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of all mini-batches in the same process groups. + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: .. math:: @@ -3267,8 +3267,11 @@ class SyncBatchNorm(layers.Layer): x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) - sync_batch_norm = nn.SyncBatchNorm(10) - hidden1 = sync_batch_norm(x) + if paddle.fluid.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(10) + hidden1 = sync_batch_norm(x) + else: + raise NotImplemented("SyncBatchNorm only support GPU") """ def __init__(self, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 686844fea76c0..dc8328d809f29 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -100,6 +100,8 @@ if (NOT ${WITH_GPU}) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) + elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() From 38bb371c9d8fa599beef5f99bc2ce0bf6645b81b Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Sun, 9 Aug 2020 08:24:16 +0000 Subject: [PATCH 03/10] add unittest,test=develop --- python/paddle/fluid/dygraph/nn.py | 25 ++++++++----------- .../fluid/tests/unittests/test_layers.py | 18 +++++++++++++ .../test_parallel_dygraph_sync_batch_norm.py | 2 +- .../unittests/test_sync_batch_norm_op.py | 18 +++++++++++++ 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 94a7375ff5a8e..966300585c1da 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3267,11 +3267,9 @@ class SyncBatchNorm(layers.Layer): x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) - if paddle.fluid.is_compiled_with_cuda(): + if fluid.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(10) hidden1 = sync_batch_norm(x) - else: - raise NotImplemented("SyncBatchNorm only support GPU") """ def __init__(self, @@ -3340,19 +3338,12 @@ def forward(self, input): variance_out = self._variance ### train mode: use mini-batch stats, eval mode: use global stats - if self.training: - use_global_stats = False - trainable_statistics = False - else: - use_global_stats = True - trainable_statistics = False - if in_dygraph_mode(): attrs = ("momentum", self._momentum, "epsilon", self._eps, "is_test", not self.training, "data_layout", self._data_layout, "use_mkldnn", False, "fuse_with_relu", - False, "use_global_stats", use_global_stats, - 'trainable_statistics', trainable_statistics) + False, "use_global_stats", not self.training, + 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( input, self.weight, self.bias, self._mean, self._variance, mean_out, variance_out, *attrs) @@ -3369,8 +3360,8 @@ def 
forward(self, input): "data_layout": self._data_layout, "use_mkldnn": False, "fuse_with_relu": False, - "use_global_stats": use_global_stats, - "trainable_statistics": trainable_statistics, + "use_global_stats": not self.training, + "trainable_statistics": False, } inputs = { @@ -3385,7 +3376,7 @@ def forward(self, input): dtype=self._dtype, stop_gradient=True) saved_variance = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True) - sync_batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + sync_batch_norm_out = self._helper.create_variable_for_type_inference( self._dtype) outputs = { @@ -3415,6 +3406,10 @@ class Flatten(layers.Layer): start_axis(int): first dim to flatten (default = 1) stop_axis(int): last dim to flatten (default = -1). + Returns: + None + + Examples: .. code-block:: python diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9da70e85f01c0..91186b2e95ae0 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -283,6 +283,24 @@ def test_layer_norm(self): with self.assertRaises(ValueError): lm(base.to_variable(inp)) + def test_SyncBatchNorm(self): + if core.is_compiled_with_cuda(): + with self.static_graph(): + t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') + my_sync_bn = nn.SyncBatchNorm(3) + ret = my_sync_bn(t) + static_ret = self.get_static_graph_result( + feed={'t': np.ones( + [3, 3, 5, 5], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + t = np.ones([3, 3, 5, 5], dtype='float32') + my_syncbn = paddle.nn.SyncBatchNorm(3) + dy_ret = my_syncbn(base.to_variable(t)) + dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, static_ret)) + def test_relu(self): with self.static_graph(): t = layers.data(name='t', shape=[3, 3], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 7d48750b88eb8..84e97127f4868 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True - self._dygraph = True + self._dygraph = False #True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 8fd118c019303..806b6b90e7e2d 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -25,6 +25,7 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler +from paddle.fluid import Program, program_guard from op_test import OpTest, _set_use_system_allocator @@ -202,5 +203,22 @@ def setUp(self): self.atol = 1e-2 +class TestDygraphSyncBatchNormAPIError(unittest.TestCase): + def test_errors(self): + if not core.is_compiled_with_cuda(): + return + + with program_guard(Program(), Program()): + my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10) + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0)) + self.assertRaises(TypeError, 
my_sync_batch_norm, x1) + + # the input dtype of SyncBatchNorm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, my_sync_batch_norm, x2) + + if __name__ == '__main__': unittest.main() From b8360a83f54989fb577703bb48ab264e5cc7c49b Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 10 Aug 2020 03:32:26 +0000 Subject: [PATCH 04/10] fix, test=develop --- python/paddle/fluid/dygraph/nn.py | 6 +++--- .../tests/unittests/parallel_dygraph_sync_batch_norm.py | 2 +- .../unittests/test_parallel_dygraph_sync_batch_norm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 966300585c1da..a082a302b1183 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3222,7 +3222,7 @@ class SyncBatchNorm(layers.Layer): moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - The normalization function formula is as follows: + The formula of normalization is as follows: .. math:: @@ -3235,8 +3235,8 @@ class SyncBatchNorm(layers.Layer): - :math:`\\beta` : trainable deviation parameter **Note**: - moving mean and moving variance will be calculate whether `track_running_stats` is set to `True` - or `False`, we will fix it in the next time. + moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` + or `False`, we will fix it in the next version. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 1cd02dadcc287..a150129b82a89 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 84e97127f4868..5c34b35fc83a3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 445f0fa5500656994b32f9ee19dc2a33ccd30815 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 10 Aug 2020 06:26:24 +0000 Subject: [PATCH 05/10] fix doc, test=develop --- python/paddle/fluid/dygraph/nn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a082a302b1183..a4aa80ac8be9e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3264,12 +3264,14 @@ class SyncBatchNorm(layers.Layer): from paddle.fluid.dygraph import to_variable import numpy as np - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) if fluid.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(10) + sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) + print(hidden1.numpy()) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] """ def __init__(self, From 39d82b564de32a472bcc48c9bba96bcb08ba7a1d Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 14 Aug 2020 02:52:24 +0000 Subject: [PATCH 06/10] fix docs,test=develop --- python/paddle/fluid/dygraph/nn.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a4aa80ac8be9e..90575b507f2b6 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3186,10 +3186,6 @@ def forward(self, nodes_vector, edge_set): class SyncBatchNorm(layers.Layer): """ - :alias_main: paddle.nn.SyncBatchNorm - :alias: paddle.nn.SyncBatchNorm,paddle.nn.layer.SyncBatchNorm,paddle.nn.layer.norm.SyncBatchNorm - :old_api: paddle.fluid.dygraph.SyncBatchNorm - This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected @@ -3259,19 +3255,18 @@ class SyncBatchNorm(layers.Layer): Examples: .. 
code-block:: python + import paddle import paddle.nn as nn - import paddle.fluid as fluid - from paddle.fluid.dygraph import to_variable import numpy as np x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - if fluid.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1.numpy()) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + paddle.disable_static() + x = paddle.to_variable(x) + if paddle.fluid.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1.numpy()) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] """ def __init__(self, From 07633024299f3be0c60dd25427e6f5721e7f6d5d Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Sun, 16 Aug 2020 05:46:29 +0000 Subject: [PATCH 07/10] fix syncbn,test=develop --- python/paddle/fluid/dygraph/nn.py | 112 ++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 30 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a324024262541..0256e840b4145 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -20,6 +20,7 @@ from ..layers import nn as F from .. import dygraph_utils from . import layers +from .base import no_grad from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr @@ -3237,9 +3238,9 @@ class SyncBatchNorm(layers.Layer): Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. - eps(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - scale_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. @@ -3272,43 +3273,57 @@ class SyncBatchNorm(layers.Layer): def __init__(self, num_features, - eps=1e-05, + epsilon=1e-05, momentum=0.9, track_running_stats=True, - scale_attr=None, + weight_attr=None, bias_attr=None, + data_format='NCHW', name=None): super(SyncBatchNorm, self).__init__() - self._scale_attr = scale_attr + self._weight_attr = weight_attr self._bias_attr = bias_attr + self._num_features = num_features + self._data_layout = data_format + self._momentum = momentum + self._epsilon = epsilon + self._track_running_stats = track_running_stats - assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
+ param_shape = [self._num_features] - self._dtype = "float32" + ### TODO(lvmengsi): remove create param when weight_attr=False in python when BatchNorm kernel support + if weight_attr == False: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = True - param_shape = [num_features] + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True - # create parameter - self.weight = self.create_parameter( - attr=self._scale_attr, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + else: + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True) - self.bias.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. self._mean = self.create_parameter( attr=ParamAttr( name=None, initializer=Constant(0.0), trainable=False, - do_model_average=False), + do_model_average=True), shape=param_shape, dtype=self._dtype) self._mean.stop_gradient = True @@ -3318,16 +3333,11 @@ def __init__(self, name=None, initializer=Constant(1.0), trainable=False, - do_model_average=False), + do_model_average=True), shape=param_shape, dtype=self._dtype) self._variance.stop_gradient = True - self._data_layout = 'NCHW' - self._momentum = momentum - self._eps = eps - self._track_running_stats = track_running_stats - def forward(self, input): # create output # mean and mean_out share the same memory @@ -3337,7 +3347,7 @@ def forward(self, input): ### train mode: use mini-batch stats, eval mode: use global stats if in_dygraph_mode(): - attrs = ("momentum", self._momentum, "epsilon", self._eps, + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_layout, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", not self.training, @@ -3353,7 +3363,7 @@ def forward(self, input): attrs = { "momentum": self._momentum, - "epsilon": self._eps, + "epsilon": self._epsilon, "is_test": not self.training, "data_layout": self._data_layout, "use_mkldnn": False, @@ -3389,6 +3399,48 @@ def forward(self, input): type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out + ### TODO: remove comment after BatchNorm merged. + #@classmethod + #def convert_sync_batchnorm(cls, layer): + # """ + # Helper function to convert :class: `paddle.nn.BatchNorm` in the model to :class: `paddle.nn.SyncBatchNorm` layers. + + # Parameters: + # layer(paddle.fluid.dygraph.Layer): layer containing one or more `BatchNorm` layers. + + # Returns: + # A new SyncBatchNorm layer object if origin layer is BatchNorm Layer. + + # Examples: + + # .. 
code-block:: python + # import paddle + # import paddle.nn as nn + + # paddle.disable_static() + # model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm(5)) + # sync_model = nn.SyncBatchNorm.convert(model) + + # """ + # layer_output = layer + # if isinstance(layer, BatchNorm): + # layer_output = SyncBatchNorm(layer._num_features, + # layer._epsilon, layer._momentum, + # layer._weight_attr, layer._bias_attr + # layer._data_layout) + + # if layer._weight_attr != False and layer._bias_attr != False: + # with no_grad(): + # layer_output.weight = layer.weight + # layer_output.bias = layer.bias + # layer_output._mean = layer._mean + # layer_output._variance = layer._variance + + # for name, sublayer in layer.named_sublayer(): + # layer_output.add_sublayer(name, cls.convert_sync_batchnorm(sublayer)) + # del layer + # return layer_output + class Flatten(layers.Layer): """ From 6e46c44f965013fd50c6e0c959cffd12af474066 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 17 Aug 2020 05:54:18 +0000 Subject: [PATCH 08/10] fix docs, test=develop --- python/paddle/fluid/dygraph/nn.py | 110 ++++++++++-------------------- 1 file changed, 35 insertions(+), 75 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 0256e840b4145..3fc2bf92d3c7e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3198,7 +3198,7 @@ class SyncBatchNorm(layers.Layer): Internal Covariate Shift `_ for more details. - When model in train mode, the :math:`\\mu_{\\beta}` + When model in training mode, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: @@ -3209,12 +3209,12 @@ class SyncBatchNorm(layers.Layer): \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - - :math:`x` : mini-batch data - - :math:`m` : the size of the mini-batch data + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data - When model in eval mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global or running statistics (moving_mean and moving_variance). - It usually got from the pre-trained model. Calculated as follows: + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ @@ -3229,25 +3229,23 @@ class SyncBatchNorm(layers.Layer): y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter - - **Note**: - moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` - or `False`, we will fix it in the next version. + - :math:`\\gamma` : trainable scale parameter vector + - :math:`\\beta` : trainable shift parameter vector Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
- weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. + If it is set to None or one attribute of ParamAttr, this layer will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. track_running_stats(bool, optional): Whether to compute global stats, which including running mean and running variance. Default: True. @@ -3289,31 +3287,35 @@ def __init__(self, self._epsilon = epsilon self._track_running_stats = track_running_stats + if self._track_running_stats == False: + logging.warn( + "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." + ) + param_shape = [self._num_features] - ### TODO(lvmengsi): remove create param when weight_attr=False in python when BatchNorm kernel support + # create parameter if weight_attr == False: self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, default_initializer=Constant(1.0)) self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. + if bias_attr == False: self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, default_initializer=Constant(0.0), is_bias=True) self.bias.stop_gradient = True - else: - # create parameter - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, is_bias=True) self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
@@ -3338,7 +3340,7 @@ def __init__(self, dtype=self._dtype) self._variance.stop_gradient = True - def forward(self, input): + def forward(self, x): # create output # mean and mean_out share the same memory mean_out = self._mean @@ -3353,13 +3355,13 @@ def forward(self, input): False, "use_global_stats", not self.training, 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( - input, self.weight, self.bias, self._mean, self._variance, - mean_out, variance_out, *attrs) + x, self.weight, self.bias, self._mean, self._variance, mean_out, + variance_out, *attrs) return sync_batch_norm_out - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], 'BatchNorm') + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') attrs = { "momentum": self._momentum, @@ -3373,7 +3375,7 @@ def forward(self, input): } inputs = { - "X": [input], + "X": [x], "Scale": [self.weight], "Bias": [self.bias], "Mean": [self._mean], @@ -3399,48 +3401,6 @@ def forward(self, input): type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out - ### TODO: remove comment after BatchNorm merged. - #@classmethod - #def convert_sync_batchnorm(cls, layer): - # """ - # Helper function to convert :class: `paddle.nn.BatchNorm` in the model to :class: `paddle.nn.SyncBatchNorm` layers. - - # Parameters: - # layer(paddle.fluid.dygraph.Layer): layer containing one or more `BatchNorm` layers. - - # Returns: - # A new SyncBatchNorm layer object if origin layer is BatchNorm Layer. - - # Examples: - - # .. code-block:: python - # import paddle - # import paddle.nn as nn - - # paddle.disable_static() - # model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm(5)) - # sync_model = nn.SyncBatchNorm.convert(model) - - # """ - # layer_output = layer - # if isinstance(layer, BatchNorm): - # layer_output = SyncBatchNorm(layer._num_features, - # layer._epsilon, layer._momentum, - # layer._weight_attr, layer._bias_attr - # layer._data_layout) - - # if layer._weight_attr != False and layer._bias_attr != False: - # with no_grad(): - # layer_output.weight = layer.weight - # layer_output.bias = layer.bias - # layer_output._mean = layer._mean - # layer_output._variance = layer._variance - - # for name, sublayer in layer.named_sublayer(): - # layer_output.add_sublayer(name, cls.convert_sync_batchnorm(sublayer)) - # del layer - # return layer_output - class Flatten(layers.Layer): """ From c3d6db64acfa818a4c7b22432e0dbf5d61f8ee13 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 17 Aug 2020 12:10:39 +0000 Subject: [PATCH 09/10] fix converage, test=develop --- python/paddle/fluid/dygraph/nn.py | 1 - .../parallel_dygraph_sync_batch_norm.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3fc2bf92d3c7e..57f3333b64364 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -20,7 +20,6 @@ from ..layers import nn as F from .. import dygraph_utils from . 
import layers -from .base import no_grad from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index a150129b82a89..4d0a977bb4de8 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -54,9 +54,27 @@ def __init__(self, self._sync_batch_norm = SyncBatchNorm(num_filters) + self._conv2 = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._sync_batch_norm2 = SyncBatchNorm( + num_filters, + weight_attr=False, + bias_attr=False, + track_running_stats=False) + def forward(self, inputs): y = self._conv(inputs) y = self._sync_batch_norm(y) + y = self._conv2(y) + y = self._sync_batch_norm2(y) return y From 5130ae358124a1c3d9cace833b624ca03db605cc Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Tue, 18 Aug 2020 03:11:59 +0000 Subject: [PATCH 10/10] fix unittest, test=develop --- python/paddle/fluid/dygraph/nn.py | 8 +++----- .../tests/unittests/parallel_dygraph_sync_batch_norm.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 57f3333b64364..a6d81b5b6377f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3260,7 +3260,7 @@ class SyncBatchNorm(layers.Layer): x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') paddle.disable_static() - x = paddle.to_variable(x) + x = paddle.to_tensor(x) if paddle.fluid.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) @@ -3296,9 +3296,7 @@ def __init__(self, # create parameter if weight_attr == False: self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) + attr=None, shape=param_shape, default_initializer=Constant(1.0)) self.weight.stop_gradient = True else: self.weight = self.create_parameter( @@ -3309,7 +3307,7 @@ def __init__(self, if bias_attr == False: self.bias = self.create_parameter( - attr=self._bias_attr, + attr=None, shape=param_shape, default_initializer=Constant(0.0), is_bias=True) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 4d0a977bb4de8..5e2059592b517 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -55,7 +55,7 @@ def __init__(self, self._sync_batch_norm = SyncBatchNorm(num_filters) self._conv2 = Conv2D( - num_channels=num_channels, + num_channels=num_filters, num_filters=num_filters, filter_size=filter_size, stride=stride,
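
The series above exposes the new layer as ``paddle.nn.SyncBatchNorm`` (aliased from ``paddle.fluid.dygraph.nn.SyncBatchNorm``). Below is a minimal single-card usage sketch adapted from the docstring example added in PATCH 08; it assumes a CUDA-enabled Paddle build from the same 2.0-beta era, since the ``sync_batch_norm`` kernel is GPU-only, and note that actual cross-GPU synchronization of the statistics additionally requires a multi-process launch, as exercised by the new ``test_parallel_dygraph_sync_batch_norm.py`` unit test via ``TestDistBase``.

    .. code-block:: python

        import numpy as np
        import paddle
        import paddle.nn as nn

        # NCHW input with 2 channels; the num_features argument of SyncBatchNorm
        # must match the channel dimension of the input.
        x = np.array([[[[0.3, 0.4], [0.3, 0.07]],
                       [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')

        paddle.disable_static()
        x = paddle.to_tensor(x)

        if paddle.fluid.is_compiled_with_cuda():
            # With a single process the statistics are computed over the local
            # mini-batch only, so the layer behaves like ordinary batch norm.
            sync_batch_norm = nn.SyncBatchNorm(2)
            hidden = sync_batch_norm(x)
            print(hidden.numpy())

Run under a distributed launcher with one process per GPU, the same code normalizes each channel with the mean and variance aggregated across all participating cards, which is the behavior the parallel unit test in this series checks.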