From 93f351089c790b521537dbc4aacee6f1796b9b46 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 7 Aug 2020 03:36:01 +0000 Subject: [PATCH 01/10] add SyncBatchNorm,test=develop --- paddle/fluid/pybind/op_function_generator.cc | 4 + python/paddle/fluid/dygraph/nn.py | 215 +++++++++++++++++- .../parallel_dygraph_sync_batch_norm.py | 91 ++++++++ .../test_parallel_dygraph_sync_batch_norm.py | 40 ++++ python/paddle/nn/__init__.py | 1 + python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/norm.py | 4 +- 7 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 7412eede118d1..50554aaf9a8ed 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -56,6 +56,9 @@ std::map> op_outs_map = { {"batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"sync_batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -75,6 +78,7 @@ std::map> op_passing_outs_map = { {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"momentum", {"ParamOut", "VelocityOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, + {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, {"matmul", {"Out"}}, diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index cc2b746b0c1e9..3205c7921d246 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -35,7 +35,7 @@ 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', 'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', - 'SpectralNorm', 'TreeConv' + 'SpectralNorm', 'TreeConv', 'SyncBatchNorm' ] @@ -3182,3 +3182,216 @@ def forward(self, nodes_vector, edge_set): else: pre_activation = out return self._helper.append_activation(pre_activation, act=self._act) + + +class SyncBatchNorm(layers.Layer): + """ + :alias_main: paddle.nn.SyncBatchNorm + :alias: paddle.nn.SyncBatchNorm,paddle.nn.layer.SyncBatchNorm,paddle.nn.layer.norm.SyncBatchNorm + :old_api: paddle.fluid.dygraph.SyncBatchNorm + + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + For more details, refer to code examples. + It implements the function of the Batch Normalization Layer and can be used + as a normalizer function for conv2d and fully connected operations. + The data is normalized by the mean and variance of the channel based on all mini-batches + of the same process groups. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in train mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of all mini-batches in the same process groups. + Calculated as follows: + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + - :math:`x` : mini-batch data + - :math:`m` : the size of the mini-batch data + + When model in eval mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global or running statistics (moving_mean and moving_variance). + It usually got from the pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + **Note**: + moving mean and moving variance will be calculate whether `track_running_stats` is set to `True` + or `False`, we will fix it in the next time. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + eps(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + scale_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + track_running_stats(bool, optional): Whether to compute global stats, which including running mean and + running variance. Default: True. + + Returns: + None + + Examples: + .. code-block:: python + + from paddle.fluid.dygraph import to_variable + import paddle.nn as nn + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + sync_batch_norm = nn.SyncBatchNorm(10) + hidden1 = sync_batch_norm(x) + """ + + def __init__(self, + num_features, + eps=1e-05, + momentum=0.9, + track_running_stats=True, + scale_attr=None, + bias_attr=None, + name=None): + super(SyncBatchNorm, self).__init__() + self._scale_attr = scale_attr + self._bias_attr = bias_attr + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." + + self._dtype = "float32" + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._scale_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. 
+ + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + self.bias.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + + self._mean = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(0.0), + trainable=False, + do_model_average=False), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(1.0), + trainable=False, + do_model_average=False), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._data_layout = 'NCHW' + self._momentum = momentum + self._epsilon = epsilon + self._track_running_stats = track_running_stats + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + ### train mode: use mini-batch stats, eval mode: use global stats + if self.training: + use_global_stats = False + trainable_statistics = False + else: + use_global_stats = True + trainable_statistics = False + + if in_dygraph_mode(): + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, + "is_test", not self.training, "data_layout", + self._data_layout, "use_mkldnn", False, "fuse_with_relu", + False, "use_global_stats", use_global_stats, + 'trainable_statistics', trainable_statistics) + sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( + input, self.weight, self.bias, self._mean, self._variance, + mean_out, variance_out, *attrs) + + return sync_batch_norm_out + + check_variable_and_dtype(input, 'input', + ['float16', 'float32', 'float64'], 'BatchNorm') + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": not self.training, + "data_layout": self._data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + "trainable_statistics": trainable_statistics, + } + + inputs = { + "X": [input], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + sync_batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + outputs = { + "Y": [sync_batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return sync_batch_norm_out diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py new file mode 100644 index 0000000000000..96be49e277deb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +#from paddle.nn import Conv2D, Pool2D, Linear, SyncBatchNorm +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, SyncBatchNorm +from paddle.fluid.dygraph.base import to_variable + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + + +class TestLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(TestLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._sync_batch_norm = SyncBatchNorm(num_filters) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._sync_batch_norm(y) + + return y + + +class TestSyncBatchNorm(TestParallelDyGraphRunnerBase): + def get_model(self): + model = TestLayer(3, 64, 7) + train_reader = paddle.batch( + paddle.dataset.flowers.test(use_xmap=False), + batch_size=32, + drop_last=True) + opt = fluid.optimizer.Adam( + learning_rate=1e-3, parameter_list=model.parameters()) + return model, train_reader, opt + + def run_one_loop(self, model, opt, data): + batch_size = len(data) + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + img = to_variable(dy_x_data) + img.stop_gradient = False + + out = model(img) + + out = fluid.layers.mean(out) + + return out + + +if __name__ == "__main__": + runtime_main(TestSyncBatchNorm) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py new file mode 100644 index 0000000000000..7d48750b88eb8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase +import paddle.fluid as fluid + +import os +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_sync_batch_norm.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index e074ca66bb1d3..c974fc1414512 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -85,6 +85,7 @@ from .layer.loss import NLLLoss #DEFINE_ALIAS from .layer.loss import BCELoss #DEFINE_ALIAS from .layer.norm import BatchNorm #DEFINE_ALIAS +from .layer.norm import SyncBatchNorm #DEFINE_ALIAS from .layer.norm import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 4963ac360804f..674782d474817 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -61,6 +61,7 @@ from .loss import NLLLoss #DEFINE_ALIAS from .loss import BCELoss #DEFINE_ALIAS from .norm import BatchNorm #DEFINE_ALIAS +from .norm import SyncBatchNorm #DEFINE_ALIAS from .norm import GroupNorm #DEFINE_ALIAS from .norm import LayerNorm #DEFINE_ALIAS from .norm import SpectralNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1beba62c1809f..1d00f9c7b8b02 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -20,7 +20,9 @@ from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS +from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS __all__ = [ - 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm' + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', + 'SyncBatchNorm' ] From 44db6f528605f9139b33458573f4f1cc6eef60f4 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 7 Aug 2020 07:19:21 +0000 Subject: [PATCH 02/10] fix unittest, test=develop --- python/paddle/fluid/dygraph/nn.py | 19 +++++++++++-------- .../fluid/tests/unittests/CMakeLists.txt | 2 ++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 86651eec939d1..94a7375ff5a8e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3191,17 +3191,17 @@ class SyncBatchNorm(layers.Layer): :old_api: paddle.fluid.dygraph.SyncBatchNorm This interface is used to construct a callable object of the ``SyncBatchNorm`` class. - For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used - as a normalizer function for conv2d and fully connected operations. - The data is normalized by the mean and variance of the channel based on all mini-batches - of the same process groups. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. 
+ The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_ for more details. When model in train mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of all mini-batches in the same process groups. + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: .. math:: @@ -3267,8 +3267,11 @@ class SyncBatchNorm(layers.Layer): x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) - sync_batch_norm = nn.SyncBatchNorm(10) - hidden1 = sync_batch_norm(x) + if paddle.fluid.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(10) + hidden1 = sync_batch_norm(x) + else: + raise NotImplemented("SyncBatchNorm only support GPU") """ def __init__(self, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 686844fea76c0..dc8328d809f29 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -100,6 +100,8 @@ if (NOT ${WITH_GPU}) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) + elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() From 38bb371c9d8fa599beef5f99bc2ce0bf6645b81b Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Sun, 9 Aug 2020 08:24:16 +0000 Subject: [PATCH 03/10] add unittest,test=develop --- python/paddle/fluid/dygraph/nn.py | 25 ++++++++----------- .../fluid/tests/unittests/test_layers.py | 18 +++++++++++++ .../test_parallel_dygraph_sync_batch_norm.py | 2 +- .../unittests/test_sync_batch_norm_op.py | 18 +++++++++++++ 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 94a7375ff5a8e..966300585c1da 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3267,11 +3267,9 @@ class SyncBatchNorm(layers.Layer): x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) - if paddle.fluid.is_compiled_with_cuda(): + if fluid.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(10) hidden1 = sync_batch_norm(x) - else: - raise NotImplemented("SyncBatchNorm only support GPU") """ def __init__(self, @@ -3340,19 +3338,12 @@ def forward(self, input): variance_out = self._variance ### train mode: use mini-batch stats, eval mode: use global stats - if self.training: - use_global_stats = False - trainable_statistics = False - else: - use_global_stats = True - trainable_statistics = False - if in_dygraph_mode(): attrs = ("momentum", self._momentum, "epsilon", self._eps, "is_test", not self.training, "data_layout", self._data_layout, "use_mkldnn", False, "fuse_with_relu", - False, "use_global_stats", use_global_stats, - 'trainable_statistics', trainable_statistics) + False, "use_global_stats", not self.training, + 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( input, self.weight, self.bias, self._mean, self._variance, mean_out, variance_out, *attrs) @@ -3369,8 +3360,8 @@ def 
forward(self, input): "data_layout": self._data_layout, "use_mkldnn": False, "fuse_with_relu": False, - "use_global_stats": use_global_stats, - "trainable_statistics": trainable_statistics, + "use_global_stats": not self.training, + "trainable_statistics": False, } inputs = { @@ -3385,7 +3376,7 @@ def forward(self, input): dtype=self._dtype, stop_gradient=True) saved_variance = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True) - sync_batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + sync_batch_norm_out = self._helper.create_variable_for_type_inference( self._dtype) outputs = { @@ -3415,6 +3406,10 @@ class Flatten(layers.Layer): start_axis(int): first dim to flatten (default = 1) stop_axis(int): last dim to flatten (default = -1). + Returns: + None + + Examples: .. code-block:: python diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9da70e85f01c0..91186b2e95ae0 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -283,6 +283,24 @@ def test_layer_norm(self): with self.assertRaises(ValueError): lm(base.to_variable(inp)) + def test_SyncBatchNorm(self): + if core.is_compiled_with_cuda(): + with self.static_graph(): + t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') + my_sync_bn = nn.SyncBatchNorm(3) + ret = my_sync_bn(t) + static_ret = self.get_static_graph_result( + feed={'t': np.ones( + [3, 3, 5, 5], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + t = np.ones([3, 3, 5, 5], dtype='float32') + my_syncbn = paddle.nn.SyncBatchNorm(3) + dy_ret = my_syncbn(base.to_variable(t)) + dy_ret_value = dy_ret.numpy() + self.assertTrue(np.array_equal(static_ret, static_ret)) + def test_relu(self): with self.static_graph(): t = layers.data(name='t', shape=[3, 3], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 7d48750b88eb8..84e97127f4868 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True - self._dygraph = True + self._dygraph = False #True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 8fd118c019303..806b6b90e7e2d 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -25,6 +25,7 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler +from paddle.fluid import Program, program_guard from op_test import OpTest, _set_use_system_allocator @@ -202,5 +203,22 @@ def setUp(self): self.atol = 1e-2 +class TestDygraphSyncBatchNormAPIError(unittest.TestCase): + def test_errors(self): + if not core.is_compiled_with_cuda(): + return + + with program_guard(Program(), Program()): + my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10) + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0)) + self.assertRaises(TypeError, 
my_sync_batch_norm, x1) + + # the input dtype of SyncBatchNorm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, my_sync_batch_norm, x2) + + if __name__ == '__main__': unittest.main() From b8360a83f54989fb577703bb48ab264e5cc7c49b Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 10 Aug 2020 03:32:26 +0000 Subject: [PATCH 04/10] fix, test=develop --- python/paddle/fluid/dygraph/nn.py | 6 +++--- .../tests/unittests/parallel_dygraph_sync_batch_norm.py | 2 +- .../unittests/test_parallel_dygraph_sync_batch_norm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 966300585c1da..a082a302b1183 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3222,7 +3222,7 @@ class SyncBatchNorm(layers.Layer): moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - The normalization function formula is as follows: + The formula of normalization is as follows: .. math:: @@ -3235,8 +3235,8 @@ class SyncBatchNorm(layers.Layer): - :math:`\\beta` : trainable deviation parameter **Note**: - moving mean and moving variance will be calculate whether `track_running_stats` is set to `True` - or `False`, we will fix it in the next time. + moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` + or `False`, we will fix it in the next version. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 1cd02dadcc287..a150129b82a89 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py index 84e97127f4868..5c34b35fc83a3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 445f0fa5500656994b32f9ee19dc2a33ccd30815 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 10 Aug 2020 06:26:24 +0000 Subject: [PATCH 05/10] fix doc, test=develop --- python/paddle/fluid/dygraph/nn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a082a302b1183..a4aa80ac8be9e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3264,12 +3264,14 @@ class SyncBatchNorm(layers.Layer): from paddle.fluid.dygraph import to_variable import numpy as np - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) if fluid.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(10) + sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) + print(hidden1.numpy()) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] """ def __init__(self, From 39d82b564de32a472bcc48c9bba96bcb08ba7a1d Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Fri, 14 Aug 2020 02:52:24 +0000 Subject: [PATCH 06/10] fix docs,test=develop --- python/paddle/fluid/dygraph/nn.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a4aa80ac8be9e..90575b507f2b6 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3186,10 +3186,6 @@ def forward(self, nodes_vector, edge_set): class SyncBatchNorm(layers.Layer): """ - :alias_main: paddle.nn.SyncBatchNorm - :alias: paddle.nn.SyncBatchNorm,paddle.nn.layer.SyncBatchNorm,paddle.nn.layer.norm.SyncBatchNorm - :old_api: paddle.fluid.dygraph.SyncBatchNorm - This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected @@ -3259,19 +3255,18 @@ class SyncBatchNorm(layers.Layer): Examples: .. 
code-block:: python + import paddle import paddle.nn as nn - import paddle.fluid as fluid - from paddle.fluid.dygraph import to_variable import numpy as np x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - if fluid.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1.numpy()) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + paddle.disable_static() + x = paddle.to_variable(x) + if paddle.fluid.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1.numpy()) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] """ def __init__(self, From 07633024299f3be0c60dd25427e6f5721e7f6d5d Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Sun, 16 Aug 2020 05:46:29 +0000 Subject: [PATCH 07/10] fix syncbn,test=develop --- python/paddle/fluid/dygraph/nn.py | 112 ++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 30 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a324024262541..0256e840b4145 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -20,6 +20,7 @@ from ..layers import nn as F from .. import dygraph_utils from . import layers +from .base import no_grad from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr @@ -3237,9 +3238,9 @@ class SyncBatchNorm(layers.Layer): Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. - eps(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - scale_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. @@ -3272,43 +3273,57 @@ class SyncBatchNorm(layers.Layer): def __init__(self, num_features, - eps=1e-05, + epsilon=1e-05, momentum=0.9, track_running_stats=True, - scale_attr=None, + weight_attr=None, bias_attr=None, + data_format='NCHW', name=None): super(SyncBatchNorm, self).__init__() - self._scale_attr = scale_attr + self._weight_attr = weight_attr self._bias_attr = bias_attr + self._num_features = num_features + self._data_layout = data_format + self._momentum = momentum + self._epsilon = epsilon + self._track_running_stats = track_running_stats - assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
+ param_shape = [self._num_features] - self._dtype = "float32" + ### TODO(lvmengsi): remove create param when weight_attr=False in python when BatchNorm kernel support + if weight_attr == False: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = True - param_shape = [num_features] + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True - # create parameter - self.weight = self.create_parameter( - attr=self._scale_attr, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + else: + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True) - self.bias.stop_gradient = self._scale_attr != None and self._scale_attr.learning_rate == 0. + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. self._mean = self.create_parameter( attr=ParamAttr( name=None, initializer=Constant(0.0), trainable=False, - do_model_average=False), + do_model_average=True), shape=param_shape, dtype=self._dtype) self._mean.stop_gradient = True @@ -3318,16 +3333,11 @@ def __init__(self, name=None, initializer=Constant(1.0), trainable=False, - do_model_average=False), + do_model_average=True), shape=param_shape, dtype=self._dtype) self._variance.stop_gradient = True - self._data_layout = 'NCHW' - self._momentum = momentum - self._eps = eps - self._track_running_stats = track_running_stats - def forward(self, input): # create output # mean and mean_out share the same memory @@ -3337,7 +3347,7 @@ def forward(self, input): ### train mode: use mini-batch stats, eval mode: use global stats if in_dygraph_mode(): - attrs = ("momentum", self._momentum, "epsilon", self._eps, + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_layout, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", not self.training, @@ -3353,7 +3363,7 @@ def forward(self, input): attrs = { "momentum": self._momentum, - "epsilon": self._eps, + "epsilon": self._epsilon, "is_test": not self.training, "data_layout": self._data_layout, "use_mkldnn": False, @@ -3389,6 +3399,48 @@ def forward(self, input): type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out + ### TODO: remove comment after BatchNorm merged. + #@classmethod + #def convert_sync_batchnorm(cls, layer): + # """ + # Helper function to convert :class: `paddle.nn.BatchNorm` in the model to :class: `paddle.nn.SyncBatchNorm` layers. + + # Parameters: + # layer(paddle.fluid.dygraph.Layer): layer containing one or more `BatchNorm` layers. + + # Returns: + # A new SyncBatchNorm layer object if origin layer is BatchNorm Layer. + + # Examples: + + # .. 
code-block:: python + # import paddle + # import paddle.nn as nn + + # paddle.disable_static() + # model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm(5)) + # sync_model = nn.SyncBatchNorm.convert(model) + + # """ + # layer_output = layer + # if isinstance(layer, BatchNorm): + # layer_output = SyncBatchNorm(layer._num_features, + # layer._epsilon, layer._momentum, + # layer._weight_attr, layer._bias_attr + # layer._data_layout) + + # if layer._weight_attr != False and layer._bias_attr != False: + # with no_grad(): + # layer_output.weight = layer.weight + # layer_output.bias = layer.bias + # layer_output._mean = layer._mean + # layer_output._variance = layer._variance + + # for name, sublayer in layer.named_sublayer(): + # layer_output.add_sublayer(name, cls.convert_sync_batchnorm(sublayer)) + # del layer + # return layer_output + class Flatten(layers.Layer): """ From 6e46c44f965013fd50c6e0c959cffd12af474066 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 17 Aug 2020 05:54:18 +0000 Subject: [PATCH 08/10] fix docs, test=develop --- python/paddle/fluid/dygraph/nn.py | 110 ++++++++++-------------------- 1 file changed, 35 insertions(+), 75 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 0256e840b4145..3fc2bf92d3c7e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3198,7 +3198,7 @@ class SyncBatchNorm(layers.Layer): Internal Covariate Shift `_ for more details. - When model in train mode, the :math:`\\mu_{\\beta}` + When model in training mode, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: @@ -3209,12 +3209,12 @@ class SyncBatchNorm(layers.Layer): \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - - :math:`x` : mini-batch data - - :math:`m` : the size of the mini-batch data + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data - When model in eval mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global or running statistics (moving_mean and moving_variance). - It usually got from the pre-trained model. Calculated as follows: + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ @@ -3229,25 +3229,23 @@ class SyncBatchNorm(layers.Layer): y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter - - **Note**: - moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` - or `False`, we will fix it in the next version. + - :math:`\\gamma` : trainable scale parameter vector + - :math:`\\beta` : trainable shift parameter vector Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
- weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. + If it is set to None or one attribute of ParamAttr, this layer will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. track_running_stats(bool, optional): Whether to compute global stats, which including running mean and running variance. Default: True. @@ -3289,31 +3287,35 @@ def __init__(self, self._epsilon = epsilon self._track_running_stats = track_running_stats + if self._track_running_stats == False: + logging.warn( + "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." + ) + param_shape = [self._num_features] - ### TODO(lvmengsi): remove create param when weight_attr=False in python when BatchNorm kernel support + # create parameter if weight_attr == False: self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, default_initializer=Constant(1.0)) self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. + if bias_attr == False: self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, default_initializer=Constant(0.0), is_bias=True) self.bias.stop_gradient = True - else: - # create parameter - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, is_bias=True) self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
@@ -3338,7 +3340,7 @@ def __init__(self, dtype=self._dtype) self._variance.stop_gradient = True - def forward(self, input): + def forward(self, x): # create output # mean and mean_out share the same memory mean_out = self._mean @@ -3353,13 +3355,13 @@ def forward(self, input): False, "use_global_stats", not self.training, 'trainable_statistics', False) sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( - input, self.weight, self.bias, self._mean, self._variance, - mean_out, variance_out, *attrs) + x, self.weight, self.bias, self._mean, self._variance, mean_out, + variance_out, *attrs) return sync_batch_norm_out - check_variable_and_dtype(input, 'input', - ['float16', 'float32', 'float64'], 'BatchNorm') + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') attrs = { "momentum": self._momentum, @@ -3373,7 +3375,7 @@ def forward(self, input): } inputs = { - "X": [input], + "X": [x], "Scale": [self.weight], "Bias": [self.bias], "Mean": [self._mean], @@ -3399,48 +3401,6 @@ def forward(self, input): type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) return sync_batch_norm_out - ### TODO: remove comment after BatchNorm merged. - #@classmethod - #def convert_sync_batchnorm(cls, layer): - # """ - # Helper function to convert :class: `paddle.nn.BatchNorm` in the model to :class: `paddle.nn.SyncBatchNorm` layers. - - # Parameters: - # layer(paddle.fluid.dygraph.Layer): layer containing one or more `BatchNorm` layers. - - # Returns: - # A new SyncBatchNorm layer object if origin layer is BatchNorm Layer. - - # Examples: - - # .. code-block:: python - # import paddle - # import paddle.nn as nn - - # paddle.disable_static() - # model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm(5)) - # sync_model = nn.SyncBatchNorm.convert(model) - - # """ - # layer_output = layer - # if isinstance(layer, BatchNorm): - # layer_output = SyncBatchNorm(layer._num_features, - # layer._epsilon, layer._momentum, - # layer._weight_attr, layer._bias_attr - # layer._data_layout) - - # if layer._weight_attr != False and layer._bias_attr != False: - # with no_grad(): - # layer_output.weight = layer.weight - # layer_output.bias = layer.bias - # layer_output._mean = layer._mean - # layer_output._variance = layer._variance - - # for name, sublayer in layer.named_sublayer(): - # layer_output.add_sublayer(name, cls.convert_sync_batchnorm(sublayer)) - # del layer - # return layer_output - class Flatten(layers.Layer): """ From c3d6db64acfa818a4c7b22432e0dbf5d61f8ee13 Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Mon, 17 Aug 2020 12:10:39 +0000 Subject: [PATCH 09/10] fix converage, test=develop --- python/paddle/fluid/dygraph/nn.py | 1 - .../parallel_dygraph_sync_batch_norm.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3fc2bf92d3c7e..57f3333b64364 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -20,7 +20,6 @@ from ..layers import nn as F from .. import dygraph_utils from . 
import layers -from .base import no_grad from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index a150129b82a89..4d0a977bb4de8 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -54,9 +54,27 @@ def __init__(self, self._sync_batch_norm = SyncBatchNorm(num_filters) + self._conv2 = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._sync_batch_norm2 = SyncBatchNorm( + num_filters, + weight_attr=False, + bias_attr=False, + track_running_stats=False) + def forward(self, inputs): y = self._conv(inputs) y = self._sync_batch_norm(y) + y = self._conv2(y) + y = self._sync_batch_norm2(y) return y From 5130ae358124a1c3d9cace833b624ca03db605cc Mon Sep 17 00:00:00 2001 From: ceci3 <592712189@qq.com> Date: Tue, 18 Aug 2020 03:11:59 +0000 Subject: [PATCH 10/10] fix unittest, test=develop --- python/paddle/fluid/dygraph/nn.py | 8 +++----- .../tests/unittests/parallel_dygraph_sync_batch_norm.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 57f3333b64364..a6d81b5b6377f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3260,7 +3260,7 @@ class SyncBatchNorm(layers.Layer): x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') paddle.disable_static() - x = paddle.to_variable(x) + x = paddle.to_tensor(x) if paddle.fluid.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) @@ -3296,9 +3296,7 @@ def __init__(self, # create parameter if weight_attr == False: self.weight = self.create_parameter( - attr=self._weight_attr, - shape=param_shape, - default_initializer=Constant(1.0)) + attr=None, shape=param_shape, default_initializer=Constant(1.0)) self.weight.stop_gradient = True else: self.weight = self.create_parameter( @@ -3309,7 +3307,7 @@ def __init__(self, if bias_attr == False: self.bias = self.create_parameter( - attr=self._bias_attr, + attr=None, shape=param_shape, default_initializer=Constant(0.0), is_bias=True) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 4d0a977bb4de8..5e2059592b517 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -55,7 +55,7 @@ def __init__(self, self._sync_batch_norm = SyncBatchNorm(num_filters) self._conv2 = Conv2D( - num_channels=num_channels, + num_channels=num_filters, num_filters=num_filters, filter_size=filter_size, stride=stride,
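
The series above exposes the new layer as ``paddle.nn.SyncBatchNorm`` (aliased from ``paddle.fluid.dygraph.nn.SyncBatchNorm``). Below is a minimal single-card usage sketch adapted from the docstring example added in PATCH 08; it assumes a CUDA-enabled Paddle build from the same 2.0-beta era, since the ``sync_batch_norm`` kernel is GPU-only, and note that actual cross-GPU synchronization of the statistics additionally requires a multi-process launch, as exercised by the new ``test_parallel_dygraph_sync_batch_norm.py`` unit test via ``TestDistBase``.

    .. code-block:: python

        import numpy as np
        import paddle
        import paddle.nn as nn

        # NCHW input with 2 channels; the num_features argument of SyncBatchNorm
        # must match the channel dimension of the input.
        x = np.array([[[[0.3, 0.4], [0.3, 0.07]],
                       [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')

        paddle.disable_static()
        x = paddle.to_tensor(x)

        if paddle.fluid.is_compiled_with_cuda():
            # With a single process the statistics are computed over the local
            # mini-batch only, so the layer behaves like ordinary batch norm.
            sync_batch_norm = nn.SyncBatchNorm(2)
            hidden = sync_batch_norm(x)
            print(hidden.numpy())

Run under a distributed launcher with one process per GPU, the same code normalizes each channel with the mean and variance aggregated across all participating cards, which is the behavior the parallel unit test in this series checks.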